use std::mem::take;

use crate::asm::{CodeBlock, Label};
use crate::asm::arm64::*;
use crate::codegen::split_patch_point;
use crate::cruby::*;
use crate::backend::lir::*;
use crate::options::asm_dump;
use crate::stats::CompileError;
use crate::virtualmem::CodePtr;
use crate::cast::*;

// Use the arm64 register type for this platform
pub type Reg = A64Reg;

/// Build the 64-bit [`Reg`] corresponding to a `MemBase::Reg` register number.
pub fn mem_base_reg(reg_no: u8) -> Reg {
    // Memory base registers are always treated as full-width (64-bit) here.
    Reg { reg_no, num_bits: 64 }
}

// Callee-saved registers
// These hold core VM state, so they survive any C function calls the
// generated code makes.
pub const CFP: Opnd = Opnd::Reg(X19_REG);
pub const EC: Opnd = Opnd::Reg(X20_REG);
pub const SP: Opnd = Opnd::Reg(X21_REG);

// C argument registers on this platform.
// Only 6 of AAPCS64's 8 argument registers (x0-x7) are listed — presumably
// to keep parity with the number available on x86_64 (TODO: confirm against
// the x86_64 backend).
pub const C_ARG_OPNDS: [Opnd; 6] = [
    Opnd::Reg(X0_REG),
    Opnd::Reg(X1_REG),
    Opnd::Reg(X2_REG),
    Opnd::Reg(X3_REG),
    Opnd::Reg(X4_REG),
    Opnd::Reg(X5_REG)
];

// C return value register on this platform
pub const C_RET_REG: Reg = X0_REG;
pub const C_RET_OPND: Opnd = Opnd::Reg(X0_REG);
// NOTE(review): XZR_REG is register number 31, which A64 also uses to encode
// SP in address contexts — presumably that dual encoding is why it stands in
// for the native stack pointer here; confirm against the register definitions.
pub const NATIVE_STACK_PTR: Opnd = Opnd::Reg(XZR_REG);
pub const NATIVE_BASE_PTR: Opnd = Opnd::Reg(X29_REG);

// These constants define the way we work with Arm64's stack pointer. The stack
// pointer always needs to be aligned to a 16-byte boundary.
pub const C_SP_REG: A64Opnd = X31;
pub const C_SP_STEP: i32 = 16;

impl CodeBlock {
    // The maximum number of bytes that can be generated by emit_jmp_ptr.
    // emit_jmp_ptr pads its output up to this size, so patching a jump later
    // never needs more room than was originally reserved.
    pub fn jmp_ptr_bytes(&self) -> usize {
        // b instruction's offset is encoded as imm26 times 4. It can jump to
        // +/-128MiB, so this can be used when --yjit-exec-mem-size <= 128.
        /*
        let num_insns = if b_offset_fits_bits(self.virtual_region_size() as i64 / 4) {
            1 // b instruction
        } else {
            5 // 4 instructions to load a 64-bit absolute address + br instruction
        };
        */
        // For now, always reserve the worst case of 5 instructions.
        let num_insns = 5; // TODO: support virtual_region_size() check
        num_insns * 4 // every A64 instruction is 4 bytes wide
    }

    // The maximum number of instructions that can be generated by emit_conditional_jump.
    fn conditional_jump_insns(&self) -> i32 {
        // The worst case is instructions for a jump + bcond.
        self.jmp_ptr_bytes() as i32 / 4 + 1
    }
}

/// Map Opnd to A64Opnd
impl From<Opnd> for A64Opnd {
    fn from(opnd: Opnd) -> Self {
        match opnd {
            // Registers and immediates translate directly.
            Opnd::Reg(reg) => A64Opnd::Reg(reg),
            Opnd::Imm(value) => A64Opnd::new_imm(value),
            Opnd::UImm(value) => A64Opnd::new_uimm(value),
            // A register-based memory operand keeps its base register and
            // displacement; the base register adopts the operand's width.
            Opnd::Mem(Mem { base: MemBase::Reg(reg_no), num_bits, disp }) => {
                let base = A64Opnd::Reg(A64Reg { num_bits, reg_no });
                A64Opnd::new_mem(num_bits, base, disp)
            },
            // Every remaining variant must have been lowered away by earlier
            // backend passes before we get here.
            Opnd::Mem(Mem { base: MemBase::VReg(_), .. }) => {
                panic!("attempted to lower an Opnd::Mem with a MemBase::VReg base")
            },
            Opnd::Mem(Mem { base: MemBase::Stack { .. }, .. }) => {
                panic!("attempted to lower an Opnd::Mem with a MemBase::Stack base")
            },
            Opnd::VReg { .. } => panic!("attempted to lower an Opnd::VReg"),
            Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"),
            Opnd::None => panic!(
                "Attempted to lower an Opnd::None. This often happens when an out operand was not allocated for an instruction because the output of the instruction was not used. Please ensure you are using the output."
            ),
        }
    }
}

/// Also implement going from a reference to an operand for convenience.
impl From<&Opnd> for A64Opnd {
    fn from(opnd: &Opnd) -> Self {
        A64Opnd::from(*opnd)
    }
}

/// Call emit_jmp_ptr and immediately invalidate the written range.
/// This is needed when next_page also moves other_cb that is not invalidated
/// by compile_with_regs. Doing it here allows you to avoid invalidating a lot
/// more than necessary when other_cb jumps from a position early in the page.
/// This invalidates a small range of cb twice, but we accept the small cost.
fn emit_jmp_ptr_with_invalidation(cb: &mut CodeBlock, dst_ptr: CodePtr) {
    let start = cb.get_write_ptr();
    emit_jmp_ptr(cb, dst_ptr, true);
    let end = cb.get_write_ptr();
    // SAFETY: start..end is exactly the range emit_jmp_ptr just wrote above.
    // Flush the instruction cache for it so the CPU can't execute stale code.
    unsafe { rb_jit_icache_invalidate(start.raw_ptr(cb) as _, end.raw_ptr(cb) as _) };
}

/// Emit an unconditional jump to `dst_ptr`. When `padding` is true, pad the
/// output with nops up to `jmp_ptr_bytes()` so the jump can later be patched
/// in place even if the patched target needs the longer encoding.
fn emit_jmp_ptr(cb: &mut CodeBlock, dst_ptr: CodePtr, padding: bool) {
    let src_addr = cb.get_write_ptr().as_offset();
    let dst_addr = dst_ptr.as_offset();

    // If the offset is short enough, then we'll use the
    // branch instruction. Otherwise, we'll move the
    // destination into a register and use the branch
    // register instruction.
    let num_insns = if b_offset_fits_bits((dst_addr - src_addr) / 4) {
        b(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32));
        1
    } else {
        // Load the 64-bit absolute address into the emit scratch register
        // (1-4 instructions), then branch through the register.
        let num_insns = emit_load_value(cb, Assembler::EMIT_OPND, dst_addr as u64);
        br(cb, Assembler::EMIT_OPND);
        num_insns + 1
    };

    if padding {
        // Make sure it's always a consistent number of
        // instructions in case it gets patched and has to
        // use the other branch.
        assert!(num_insns * 4 <= cb.jmp_ptr_bytes());
        for _ in num_insns..(cb.jmp_ptr_bytes() / 4) {
            nop(cb);
        }
    }
}

/// Emit the required instructions to load the given value into the
/// given register. Our goal here is to use as few instructions as
/// possible to get this value into the register.
/// Returns the number of instructions emitted (1 to 4).
fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize {
    let mut current = value;

    if current <= 0xffff {
        // If the value fits into a single movz
        // instruction, then we'll use that.
        movz(cb, rd, A64Opnd::new_uimm(current), 0);
        1
    } else if u16::try_from(!value).is_ok() {
        // For small negative values, use a single movn
        // (move wide with NOT: rd = !imm16).
        movn(cb, rd, A64Opnd::new_uimm(!value), 0);
        1
    } else if BitmaskImmediate::try_from(current).is_ok() {
        // Otherwise, if the immediate can be encoded
        // with the special bitmask immediate encoding,
        // we'll use that.
        mov(cb, rd, A64Opnd::new_uimm(current));
        1
    } else {
        // Finally we'll fall back to encoding the value
        // using movz for the first 16 bits and movk for
        // each subsequent set of 16 bits as long as they
        // are necessary.
        movz(cb, rd, A64Opnd::new_uimm(current & 0xffff), 0);
        let mut num_insns = 1;

        // (We're sure this is necessary since we
        // checked if it only fit into movz above).
        current >>= 16;
        movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 16);
        num_insns += 1;

        // Each movk below only runs if any higher bits remain set.
        if current > 0xffff {
            current >>= 16;
            movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 32);
            num_insns += 1;
        }

        if current > 0xffff {
            current >>= 16;
            movk(cb, rd, A64Opnd::new_uimm(current & 0xffff), 48);
            num_insns += 1;
        }
        num_insns
    }
}

/// List of registers that can be used for register allocation.
/// This has the same number of registers for x86_64 and arm64.
/// SCRATCH0_OPND, SCRATCH1_OPND, SCRATCH2_OPND, and EMIT_OPND
/// (x15, x17, x14, x16) are excluded.
pub const ALLOC_REGS: &[Reg] = &[
    X0_REG,
    X1_REG,
    X2_REG,
    X3_REG,
    X4_REG,
    X5_REG,
    X11_REG,
    X12_REG,
];

/// Special scratch registers for intermediate processing. They should be used only by
/// [`Assembler::arm64_scratch_split`] or [`Assembler::new_with_scratch_reg`].
/// NOTE(review): x16/x17 are AAPCS64 intra-procedure-call registers — confirm
/// nothing expects them to survive across calls.
const SCRATCH0_OPND: Opnd = Opnd::Reg(X15_REG);
const SCRATCH1_OPND: Opnd = Opnd::Reg(X17_REG);
const SCRATCH2_OPND: Opnd = Opnd::Reg(X14_REG);

impl Assembler {
    /// Special register for intermediate processing in arm64_emit. It should be used only by arm64_emit.
    /// This is distinct from the SCRATCH*_OPND registers so the two layers can't clobber each other.
    const EMIT_REG: Reg = X16_REG;
    const EMIT_OPND: A64Opnd = A64Opnd::Reg(Self::EMIT_REG);

    /// Return an Assembler with scratch registers disabled in the backend, and a scratch register.
    pub fn new_with_scratch_reg() -> (Self, Opnd) {
        let asm = Self::new_with_accept_scratch_reg(true);
        (asm, SCRATCH0_OPND)
    }

    /// Return true if opnd contains a scratch reg
    pub fn has_scratch_reg(opnd: Opnd) -> bool {
        let scratch_reg = SCRATCH0_OPND.unwrap_reg();
        Self::has_reg(opnd, scratch_reg)
    }

    /// Get the list of registers from which we will allocate on this platform
    pub fn get_alloc_regs() -> Vec<Reg> {
        ALLOC_REGS.iter().cloned().collect()
    }

    /// Get a list of all of the caller-saved registers
    pub fn get_caller_save_regs() -> Vec<Reg> {
        Vec::from([X1_REG, X9_REG, X10_REG, X11_REG, X12_REG, X13_REG, X14_REG, X15_REG])
    }

    /// How many bytes a call and a [Self::frame_setup] would change native SP
    pub fn frame_size() -> i32 {
        // One 16-byte stack slot, matching the C_SP_STEP alignment unit.
        16
    }

    /// Split platform-specific instructions
    /// The transformations done here are meant to make our lives simpler in later
    /// stages of the compilation pipeline.
    /// Here we may want to make sure that all instructions (except load and store)
    /// have no memory operands.
    fn arm64_split(mut self) -> Assembler
    {
        /// When you're storing a register into a memory location or loading a
        /// memory location into a register, the displacement from the base
        /// register of the memory location must fit into 9 bits. If it doesn't,
        /// then we need to load that memory address into a register first.
        fn split_memory_address(asm: &mut Assembler, opnd: Opnd) -> Opnd {
            match opnd {
                Opnd::Mem(mem) => {
                    if mem_disp_fits_bits(mem.disp) {
                        opnd
                    } else {
                        // Compute the full address with lea and use a zero
                        // displacement from the resulting register instead.
                        let base = asm.lea(Opnd::Mem(Mem { num_bits: 64, ..mem }));
                        Opnd::mem(mem.num_bits, base, 0)
                    }
                },
                _ => unreachable!("Can only split memory addresses.")
            }
        }

        /// Any memory operands you're sending into an Op::Load instruction need
        /// to be split in case their displacement doesn't fit into 9 bits.
        fn split_load_operand(asm: &mut Assembler, opnd: Opnd) -> Opnd {
            match opnd {
                Opnd::Reg(_) | Opnd::VReg { .. } => opnd,
                Opnd::Mem(_) => {
                    let split_opnd = split_memory_address(asm, opnd);
                    let out_opnd = asm.load(split_opnd);
                    // Many Arm insns support only 32-bit or 64-bit operands. asm.load with fewer
                    // bits zero-extends the value, so it's safe to recognize it as a 32-bit value.
                    if out_opnd.rm_num_bits() < 32 {
                        out_opnd.with_num_bits(32)
                    } else {
                        out_opnd
                    }
                },
                _ => asm.load(opnd)
            }
        }

        /// Operands that take the place of bitmask immediates must follow a
        /// certain encoding. In this function we ensure that those operands
        /// do follow that encoding, and if they don't then we load them first.
        fn split_bitmask_immediate(asm: &mut Assembler, opnd: Opnd, dest_num_bits: u8) -> Opnd {
            match opnd {
                Opnd::Reg(_) | Opnd::VReg { .. } => opnd,
                Opnd::Mem(_) => split_load_operand(asm, opnd),
                Opnd::Imm(imm) => {
                    if imm == 0 {
                        // Zero can always be read from the zero register.
                        Opnd::Reg(XZR_REG)
                    } else if (dest_num_bits == 64 &&
                                BitmaskImmediate::try_from(imm as u64).is_ok()) ||
                            (dest_num_bits == 32 &&
                                u32::try_from(imm).is_ok() &&
                                BitmaskImmediate::new_32b_reg(imm as u32).is_ok()) {
                        Opnd::UImm(imm as u64)
                    } else {
                        asm.load(opnd).with_num_bits(dest_num_bits)
                    }
                },
                Opnd::UImm(uimm) => {
                    if (dest_num_bits == 64 && BitmaskImmediate::try_from(uimm).is_ok()) ||
                        (dest_num_bits == 32 &&
                            u32::try_from(uimm).is_ok() &&
                            BitmaskImmediate::new_32b_reg(uimm as u32).is_ok()) {
                        opnd
                    } else {
                        asm.load(opnd).with_num_bits(dest_num_bits)
                    }
                },
                Opnd::None | Opnd::Value(_) => unreachable!()
            }
        }

        /// Operands that take the place of a shifted immediate must fit within
        /// a certain size. If they don't then we need to load them first.
        fn split_shifted_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd {
            match opnd {
                Opnd::Reg(_) | Opnd::VReg { .. } => opnd,
                Opnd::Mem(_) => split_load_operand(asm, opnd),
                Opnd::Imm(imm) => if ShiftedImmediate::try_from(imm as u64).is_ok() {
                    opnd
                } else {
                    asm.load(opnd)
                }
                Opnd::UImm(uimm) => {
                    if ShiftedImmediate::try_from(uimm).is_ok() {
                        opnd
                    } else {
                        asm.load(opnd)
                    }
                },
                Opnd::None | Opnd::Value(_) => unreachable!()
            }
        }

        /// Returns the operands that should be used for a boolean logic
        /// instruction.
        fn split_boolean_operands(asm: &mut Assembler, opnd0: Opnd, opnd1: Opnd) -> (Opnd, Opnd) {
            match (opnd0, opnd1) {
                (Opnd::Reg(_), Opnd::Reg(_)) => {
                    (opnd0, opnd1)
                },
                // Boolean ops are commutative, so put the register on the left
                // and legalize the other operand as a bitmask immediate.
                (reg_opnd @ Opnd::Reg(_), other_opnd) |
                (other_opnd, reg_opnd @ Opnd::Reg(_)) => {
                    let opnd1 = split_bitmask_immediate(asm, other_opnd, reg_opnd.rm_num_bits());
                    (reg_opnd, opnd1)
                },
                _ => {
                    let opnd0 = split_load_operand(asm, opnd0);
                    let opnd1 = split_bitmask_immediate(asm, opnd1, opnd0.rm_num_bits());
                    (opnd0, opnd1)
                }
            }
        }

        /// Returns the operands that should be used for a csel instruction.
        /// Both sides of a csel must be registers.
        fn split_csel_operands(asm: &mut Assembler, opnd0: Opnd, opnd1: Opnd) -> (Opnd, Opnd) {
            let opnd0 = match opnd0 {
                Opnd::Reg(_) | Opnd::VReg { .. } => opnd0,
                _ => split_load_operand(asm, opnd0)
            };

            let opnd1 = match opnd1 {
                Opnd::Reg(_) | Opnd::VReg { .. } => opnd1,
                _ => split_load_operand(asm, opnd1)
            };

            (opnd0, opnd1)
        }

        /// cmp only operates on 32/64-bit registers, so mask 8/16-bit operands
        /// down to their significant bits in a 64-bit register first.
        fn split_less_than_32_cmp(asm: &mut Assembler, opnd0: Opnd) -> Opnd {
            match opnd0 {
                Opnd::Reg(_) | Opnd::VReg { .. } => {
                    match opnd0.rm_num_bits() {
                        8 => asm.and(opnd0.with_num_bits(64), Opnd::UImm(0xff)),
                        16 => asm.and(opnd0.with_num_bits(64), Opnd::UImm(0xffff)),
                        32 | 64 => opnd0,
                        bits => unreachable!("Invalid number of bits. {}", bits)
                    }
                }
                _ => opnd0
            }
        }

        // Rebuild the assembler: iterate over the original instructions and
        // push (possibly split) replacements into a fresh Assembler.
        let mut asm_local = Assembler::new_with_asm(&self);
        let live_ranges: Vec<LiveRange> = take(&mut self.live_ranges);
        let mut iterator = self.instruction_iterator();
        let asm = &mut asm_local;

        while let Some((index, mut insn)) = iterator.next(asm) {
            // Here we're going to map the operands of the instruction to load
            // any Opnd::Value operands into registers if they are heap objects
            // such that only the Op::Load instruction needs to handle that
            // case. If the values aren't heap objects then we'll treat them as
            // if they were just unsigned integer.
            let is_load = matches!(insn, Insn::Load { .. } | Insn::LoadInto { .. });
            let mut opnd_iter = insn.opnd_iter_mut();

            while let Some(opnd) = opnd_iter.next() {
                if let Opnd::Value(value) = opnd {
                    if value.special_const_p() {
                        *opnd = Opnd::UImm(value.as_u64());
                    } else if !is_load {
                        *opnd = asm.load(*opnd);
                    }
                };
            }

            // We are replacing instructions here so we know they are already
            // being used. It is okay not to use their output here.
            #[allow(unused_must_use)]
            match &mut insn {
                Insn::Add { left, right, out } => {
                    match (*left, *right) {
                        // When one operand is a register, legalize the other operand
                        // into possibly an immediate and swap the order if necessary.
                        // Only the rhs of ADD can be an immediate, but addition is commutative.
                        (reg_opnd @ (Opnd::Reg(_) | Opnd::VReg { .. }), other_opnd) |
                        (other_opnd, reg_opnd @ (Opnd::Reg(_) | Opnd::VReg { .. })) => {
                            *left = reg_opnd;
                            *right = split_shifted_immediate(asm, other_opnd);
                            // Now `right` is either a register or an immediate, both can try to
                            // merge with a subsequent mov.
                            merge_three_reg_mov(&live_ranges, &mut iterator, asm, left, left, out);
                            asm.push_insn(insn);
                        }
                        _ => {
                            *left = split_load_operand(asm, *left);
                            *right = split_shifted_immediate(asm, *right);
                            merge_three_reg_mov(&live_ranges, &mut iterator, asm, left, right, out);
                            asm.push_insn(insn);
                        }
                    }
                }
                Insn::Sub { left, right, out } => {
                    *left = split_load_operand(asm, *left);
                    *right = split_shifted_immediate(asm, *right);
                    // Now `right` is either a register or an immediate,
                    // both can try to merge with a subsequent mov.
                    merge_three_reg_mov(&live_ranges, &mut iterator, asm, left, left, out);
                    asm.push_insn(insn);
                }
                Insn::And { left, right, out } |
                Insn::Or { left, right, out } |
                Insn::Xor { left, right, out } => {
                    let (opnd0, opnd1) = split_boolean_operands(asm, *left, *right);
                    *left = opnd0;
                    *right = opnd1;

                    merge_three_reg_mov(&live_ranges, &mut iterator, asm, left, right, out);

                    asm.push_insn(insn);
                }
                /*
                // Lower to Joz and Jonz for generating CBZ/CBNZ for compare-with-0-and-branch.
                ref insn @ Insn::Cmp { ref left, right: ref right @ (Opnd::UImm(0) | Opnd::Imm(0)) } |
                ref insn @ Insn::Test { ref left, right: ref right @ (Opnd::InsnOut { .. } | Opnd::Reg(_)) } if {
                    let same_opnd_if_test = if let Insn::Test { .. } = insn {
                        left == right
                    } else {
                        true
                    };

                    same_opnd_if_test && if let Some(
                            Insn::Jz(target) | Insn::Je(target) | Insn::Jnz(target) | Insn::Jne(target)
                        ) = iterator.peek() {
                            matches!(target, Target::SideExit { .. })
                        } else {
                            false
                        }
                } => {
                    let reg = split_load_operand(asm, *left);
                    match iterator.peek() {
                        Some(Insn::Jz(target) | Insn::Je(target))   => asm.push_insn(Insn::Joz(reg, *target)),
                        Some(Insn::Jnz(target) | Insn::Jne(target)) => asm.push_insn(Insn::Jonz(reg, *target)),
                        _ => ()
                    }

                    iterator.map_insn_index(asm);
                    iterator.next_unmapped(); // Pop merged jump instruction
                }
                */
                Insn::CCall { opnds, .. } => {
                    assert!(opnds.len() <= C_ARG_OPNDS.len());

                    // Load each operand into the corresponding argument
                    // register.
                    // Note: the iteration order is reversed to avoid corrupting x0,
                    // which is both the return value and first argument register
                    if !opnds.is_empty() {
                        let mut args: Vec<(Opnd, Opnd)> = vec![];
                        for (idx, opnd) in opnds.iter_mut().enumerate().rev() {
                            // If the value that we're sending is 0, then we can use
                            // the zero register, so in this case we'll just send
                            // a UImm of 0 along as the argument to the move.
                            let value = match opnd {
                                Opnd::UImm(0) | Opnd::Imm(0) => Opnd::UImm(0),
                                Opnd::Mem(_) => split_memory_address(asm, *opnd),
                                _ => *opnd
                            };
                            args.push((C_ARG_OPNDS[idx], value));
                        }
                        asm.parallel_mov(args);
                    }

                    // Now we push the CCall without any arguments so that it
                    // just performs the call.
                    *opnds = vec![];
                    asm.push_insn(insn);
                },
                Insn::Cmp { left, right } => {
                    let opnd0 = split_load_operand(asm, *left);
                    let opnd0 = split_less_than_32_cmp(asm, opnd0);
                    let split_right = split_shifted_immediate(asm, *right);
                    // Match the rhs width to the lhs so both sides of the cmp agree.
                    let opnd1 = match split_right {
                        Opnd::VReg { .. } if opnd0.num_bits() != split_right.num_bits() => {
                            split_right.with_num_bits(opnd0.num_bits().unwrap())
                        },
                        _ => split_right
                    };

                    asm.cmp(opnd0, opnd1);
                },
                Insn::CRet(opnd) => {
                    match opnd {
                        // If the value is already in the return register, then
                        // we don't need to do anything.
                        Opnd::Reg(C_RET_REG) => {},

                        // If the value is a memory address, we need to first
                        // make sure the displacement isn't too large and then
                        // load it into the return register.
                        Opnd::Mem(_) => {
                            let split = split_memory_address(asm, *opnd);
                            asm.load_into(C_RET_OPND, split);
                        },

                        // Otherwise we just need to load the value into the
                        // return register.
                        _ => {
                            asm.load_into(C_RET_OPND, *opnd);
                        }
                    }
                    asm.cret(C_RET_OPND);
                },
                Insn::CSelZ { truthy, falsy, out } |
                Insn::CSelNZ { truthy, falsy, out } |
                Insn::CSelE { truthy, falsy, out } |
                Insn::CSelNE { truthy, falsy, out } |
                Insn::CSelL { truthy, falsy, out } |
                Insn::CSelLE { truthy, falsy, out } |
                Insn::CSelG { truthy, falsy, out } |
                Insn::CSelGE { truthy, falsy, out } => {
                    let (opnd0, opnd1) = split_csel_operands(asm, *truthy, *falsy);
                    *truthy = opnd0;
                    *falsy = opnd1;
                    // Merge `csel` and `mov` into a single `csel` when possible
                    match iterator.peek().map(|(_, insn)| insn) {
                        // Only merge when the mov source is this csel's output
                        // and that output dies right after the mov.
                        Some(Insn::Mov { dest: Opnd::Reg(reg), src })
                        if matches!(out, Opnd::VReg { .. }) && *out == *src && live_ranges[out.vreg_idx()].end() == index + 1 => {
                            *out = Opnd::Reg(*reg);
                            asm.push_insn(insn);
                            iterator.next(asm); // Pop merged Insn::Mov
                        }
                        _ => {
                            asm.push_insn(insn);
                        }
                    }
                },
                Insn::JmpOpnd(opnd) => {
                    if let Opnd::Mem(_) = opnd {
                        let opnd0 = split_load_operand(asm, *opnd);
                        asm.jmp_opnd(opnd0);
                    } else {
                        asm.jmp_opnd(*opnd);
                    }
                },
                Insn::Load { opnd, .. } |
                Insn::LoadInto { opnd, .. } => {
                    *opnd = match opnd {
                        Opnd::Mem(_) => split_memory_address(asm, *opnd),
                        _ => *opnd
                    };
                    asm.push_insn(insn);
                },
                Insn::LoadSExt { opnd, out } => {
                    match opnd {
                        // We only want to sign extend if the operand is a
                        // register, instruction output, or memory address that
                        // is 32 bits. Otherwise we'll just load the value
                        // directly since there's no need to sign extend.
                        Opnd::Reg(Reg { num_bits: 32, .. }) |
                        Opnd::VReg { num_bits: 32, .. } |
                        Opnd::Mem(Mem { num_bits: 32, .. }) => {
                            asm.push_insn(insn);
                        },
                        _ => {
                            asm.push_insn(Insn::Load { opnd: *opnd, out: *out });
                        }
                    };
                },
                Insn::Mov { dest, src } => {
                    match (&dest, &src) {
                        // If we're attempting to load into a memory operand, then
                        // we'll switch over to the store instruction.
                        (Opnd::Mem(_), _) => {
                            let opnd0 = split_memory_address(asm, *dest);
                            let value = match *src {
                                // If the first operand is zero, then we can just use
                                // the zero register.
                                Opnd::UImm(0) | Opnd::Imm(0) => Opnd::Reg(XZR_REG),
                                // If the first operand is a memory operand, we're going
                                // to transform this into a store instruction, so we'll
                                // need to load this anyway.
                                Opnd::UImm(_) => asm.load(*src),
                                // The value that is being moved must be either a
                                // register or an immediate that can be encoded as a
                                // bitmask immediate. Otherwise, we'll need to split the
                                // move into multiple instructions.
                                _ => split_bitmask_immediate(asm, *src, dest.rm_num_bits())
                            };

                            asm.store(opnd0, value);
                        },
                        // If we're loading a memory operand into a register, then
                        // we'll switch over to the load instruction.
                        (Opnd::Reg(_) | Opnd::VReg { .. }, Opnd::Mem(_)) => {
                            let value = split_memory_address(asm, *src);
                            asm.load_into(*dest, value);
                        },
                        // Otherwise we'll use the normal mov instruction.
                        (Opnd::Reg(_), _) => {
                            let value = match *src {
                                // Unlike other instructions, we can avoid splitting this case, using movz.
                                Opnd::UImm(uimm) if uimm <= 0xffff => *src,
                                _ => split_bitmask_immediate(asm, *src, dest.rm_num_bits()),
                            };
                            asm.mov(*dest, value);
                        },
                        _ => unreachable!("unexpected combination of operands in Insn::Mov: {dest:?}, {src:?}")
                    };
                },
                Insn::Not { opnd, .. } => {
                    // The value that is being negated must be in a register, so
                    // if we get anything else we need to load it first.
                    *opnd = match opnd {
                        Opnd::Mem(_) => split_load_operand(asm, *opnd),
                        _ => *opnd
                    };
                    asm.push_insn(insn);
                },
                Insn::LShift { opnd, .. } |
                Insn::RShift { opnd, .. } |
                Insn::URShift { opnd, .. } => {
                    // The operand must be in a register, so
                    // if we get anything else we need to load it first.
                    *opnd = split_load_operand(asm, *opnd);
                    asm.push_insn(insn);
                },
                Insn::Mul { left, right, .. } => {
                    // Both operands of mul must be registers.
                    *left = split_load_operand(asm, *left);
                    *right = split_load_operand(asm, *right);
                    asm.push_insn(insn);
                },
                Insn::Test { left, right } => {
                    // The value being tested must be in a register, so if it's
                    // not already one we'll load it first.
                    let opnd0 = split_load_operand(asm, *left);

                    // The second value must be either a register or an
                    // unsigned immediate that can be encoded as a bitmask
                    // immediate. If it's not one of those, we'll need to load
                    // it first.
                    let opnd1 = split_bitmask_immediate(asm, *right, opnd0.rm_num_bits());
                    asm.test(opnd0, opnd1);
                },
                _ => {
                    asm.push_insn(insn);
                }
            }
        }

        asm_local
    }

    /// Split instructions using scratch registers. To maximize the use of the register pool for
    /// VRegs, most splits should happen in [`Self::arm64_split`]. However, some instructions
    /// need to be split with registers after `alloc_regs`, e.g. for `compile_exits`, so this
    /// splits them and uses scratch registers for it.
    fn arm64_scratch_split(mut self) -> Assembler {
        /// If opnd is Opnd::Mem with a too large disp, make the disp smaller using lea.
        fn split_large_disp(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd {
            match opnd {
                // The displacement doesn't fit the load/store encoding:
                // materialize the full address into scratch_opnd and use a
                // zero displacement from there.
                Opnd::Mem(Mem { num_bits, disp, .. }) if !mem_disp_fits_bits(disp) => {
                    asm.lea_into(scratch_opnd, opnd);
                    Opnd::mem(num_bits, scratch_opnd, 0)
                }
                _ => opnd,
            }
        }

        /// If opnd is Opnd::Mem with MemBase::Stack, lower it to Opnd::Mem with MemBase::Reg, and split a large disp.
        fn split_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd {
            // Lower the stack base first, then fix up any oversized displacement.
            let opnd = split_only_stack_membase(asm, opnd, scratch_opnd, stack_state);
            split_large_disp(asm, opnd, scratch_opnd)
        }

        /// split_stack_membase but without split_large_disp. This should be used only by lea.
        fn split_only_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd {
            if let Opnd::Mem(Mem { base: stack_membase @ MemBase::Stack { .. }, disp: opnd_disp, num_bits: opnd_num_bits }) = opnd {
                let base = Opnd::Mem(stack_state.stack_membase_to_mem(stack_membase));
                let base = split_large_disp(asm, base, scratch_opnd);
                asm.load_into(scratch_opnd, base);
                Opnd::Mem(Mem { base: MemBase::Reg(scratch_opnd.unwrap_reg().reg_no), disp: opnd_disp, num_bits: opnd_num_bits })
            } else {
                opnd
            }
        }

        /// If opnd is Opnd::Mem, lower it to scratch_opnd. You should use this when `opnd` is read by the instruction, not written.
        fn split_memory_read(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd {
            if let Opnd::Mem(_) = opnd {
                let opnd = split_large_disp(asm, opnd, scratch_opnd);
                let scratch_opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd);
                asm.load_into(scratch_opnd, opnd);
                scratch_opnd
            } else {
                opnd
            }
        }

        /// If opnd is Opnd::Mem, set scratch_reg to *opnd. Return Some(Opnd::Mem) if it needs to be written back from scratch_reg.
        fn split_memory_write(opnd: &mut Opnd, scratch_opnd: Opnd) -> Option<Opnd> {
            if let Opnd::Mem(_) = opnd {
                let mem_opnd = opnd.clone();
                *opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd);
                Some(mem_opnd)
            } else {
                None
            }
        }

        // Prepare StackState to lower MemBase::Stack
        let stack_state = StackState::new(self.stack_base_idx);

        let mut asm_local = Assembler::new_with_asm(&self);
        let asm = &mut asm_local;
        asm.accept_scratch_reg = true;
        let iterator = &mut self.instruction_iterator();

        while let Some((_, mut insn)) = iterator.next(asm) {
            match &mut insn {
                Insn::Add { left, right, out } |
                Insn::Sub { left, right, out } |
                Insn::And { left, right, out } |
                Insn::Or { left, right, out } |
                Insn::Xor { left, right, out } |
                Insn::CSelZ  { truthy: left, falsy: right, out } |
                Insn::CSelNZ { truthy: left, falsy: right, out } |
                Insn::CSelE  { truthy: left, falsy: right, out } |
                Insn::CSelNE { truthy: left, falsy: right, out } |
                Insn::CSelL  { truthy: left, falsy: right, out } |
                Insn::CSelLE { truthy: left, falsy: right, out } |
                Insn::CSelG  { truthy: left, falsy: right, out } |
                Insn::CSelGE { truthy: left, falsy: right, out } => {
                    *left = split_memory_read(asm, *left, SCRATCH0_OPND);
                    *right = split_memory_read(asm, *right, SCRATCH1_OPND);
                    let mem_out = split_memory_write(out, SCRATCH0_OPND);

                    asm.push_insn(insn);

                    if let Some(mem_out) = mem_out {
                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
                        asm.store(mem_out, SCRATCH0_OPND);
                    }
                }
                Insn::Mul { left, right, out } => {
                    *left = split_memory_read(asm, *left, SCRATCH0_OPND);
                    *right = split_memory_read(asm, *right, SCRATCH1_OPND);
                    let mem_out = split_memory_write(out, SCRATCH0_OPND);
                    let reg_out = out.clone();

                    asm.push_insn(insn);

                    if let Some(mem_out) = mem_out {
                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
                        asm.store(mem_out, SCRATCH0_OPND);
                    };

                    // If the next instruction is JoMul
                    if matches!(iterator.peek(), Some((_, Insn::JoMul(_)))) {
                        // Produce a register that is all zeros or all ones
                        // Based on the sign bit of the 64-bit mul result
                        asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: reg_out, shift: Opnd::UImm(63) });
                    }
                }
                Insn::LShift { opnd, out, .. } |
                Insn::RShift { opnd, out, .. } => {
                    *opnd = split_memory_read(asm, *opnd, SCRATCH0_OPND);
                    let mem_out = split_memory_write(out, SCRATCH0_OPND);

                    asm.push_insn(insn);

                    if let Some(mem_out) = mem_out {
                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
                        asm.store(mem_out, SCRATCH0_OPND);
                    }
                }
                Insn::Cmp { left, right } |
                Insn::Test { left, right } => {
                    *left = split_memory_read(asm, *left, SCRATCH0_OPND);
                    *right = split_memory_read(asm, *right, SCRATCH1_OPND);
                    asm.push_insn(insn);
                }
                // For compile_exits, support splitting simple C arguments here
                Insn::CCall { opnds, .. } if !opnds.is_empty() => {
                    for (i, opnd) in opnds.iter().enumerate() {
                        asm.load_into(C_ARG_OPNDS[i], *opnd);
                    }
                    *opnds = vec![];
                    asm.push_insn(insn);
                }
                // For compile_exits, support splitting simple return values here
                Insn::CRet(opnd) => {
                    match opnd {
                        Opnd::Reg(C_RET_REG) => {},
                        _ => asm.load_into(C_RET_OPND, *opnd),
                    }
                    asm.cret(C_RET_OPND);
                }
                Insn::Lea { opnd, out } => {
                    *opnd = split_only_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state);
                    let mem_out = split_memory_write(out, SCRATCH0_OPND);

                    asm.push_insn(insn);

                    if let Some(mem_out) = mem_out {
                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
                        asm.store(mem_out, SCRATCH0_OPND);
                    }
                }
                Insn::Load { opnd, out } |
                Insn::LoadInto { opnd, dest: out } => {
                    *opnd = split_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state);
                    *out = split_stack_membase(asm, *out, SCRATCH1_OPND, &stack_state);

                    if let Opnd::Mem(_) = out {
                        // If NATIVE_STACK_PTR is used as a source for Store, it's handled as xzr, storeing zero.
                        // To save the content of NATIVE_STACK_PTR, we need to load it into another register first.
                        if *opnd == NATIVE_STACK_PTR {
                            asm.load_into(SCRATCH0_OPND, NATIVE_STACK_PTR);
                            *opnd = SCRATCH0_OPND;
                        }
                        asm.store(*out, *opnd);
                    } else {
                        asm.push_insn(insn);
                    }
                }
                &mut Insn::IncrCounter { mem, value } => {
                    // Convert Opnd::const_ptr into Opnd::Mem.
                    // It's split here to support IncrCounter in compile_exits.
                    assert!(matches!(mem, Opnd::UImm(_)));
                    asm.load_into(SCRATCH0_OPND, mem);
                    asm.lea_into(SCRATCH0_OPND, Opnd::mem(64, SCRATCH0_OPND, 0));

                    // Create a local loop to atomically increment a counter using SCRATCH1_OPND to check if it succeeded.
                    // Note that arm64_emit will peek at the next Cmp to set a status into SCRATCH1_OPND on IncrCounter.
                    let label = asm.new_label("incr_counter_loop");
                    asm.write_label(label.clone());
                    asm.incr_counter(SCRATCH0_OPND, value);
                    asm.cmp(SCRATCH1_OPND, 0.into());
                    asm.jne(label);
                }
                Insn::Store { dest, .. } => {
                    *dest = split_stack_membase(asm, *dest, SCRATCH0_OPND, &stack_state);
                    asm.push_insn(insn);
                }
                Insn::Mov { dest, src } => {
                    *src = split_stack_membase(asm, *src, SCRATCH0_OPND, &stack_state);
                    *dest = split_large_disp(asm, *dest, SCRATCH1_OPND);
                    match dest {
                        Opnd::Reg(_) => asm.load_into(*dest, *src),
                        Opnd::Mem(_) => asm.store(*dest, *src),
                        _ => asm.push_insn(insn),
                    }
                }
                // Resolve ParallelMov that couldn't be handled without a scratch register.
                Insn::ParallelMov { moves } => {
                    for (dst, src) in Self::resolve_parallel_moves(moves, Some(SCRATCH0_OPND)).unwrap() {
                        let src = split_stack_membase(asm, src, SCRATCH1_OPND, &stack_state);
                        let dst = split_large_disp(asm, dst, SCRATCH2_OPND);
                        match dst {
                            Opnd::Reg(_) => asm.load_into(dst, src),
                            Opnd::Mem(_) => asm.store(dst, src),
                            _ => asm.mov(dst, src),
                        }
                    }
                }
                &mut Insn::PatchPoint { ref target, invariant, payload } => {
                    split_patch_point(asm, target, invariant, payload);
                }
                _ => {
                    asm.push_insn(insn);
                }
            }
        }

        asm_local
    }

    /// Emit platform-specific machine code
    /// Returns a list of GC offsets. Can return failure to signal caller to retry.
    fn arm64_emit(&mut self, cb: &mut CodeBlock) -> Option<Vec<CodePtr>> {
        /// Determine how many instructions it will take to represent moving
        /// this value into a register. Note that the return value of this
        /// function must correspond to how many instructions are used to
        /// represent this load in the emit_load_value function.
        fn emit_load_size(value: u64) -> u8 {
            if BitmaskImmediate::try_from(value).is_ok() {
                return 1;
            }

            if value < (1 << 16) {
                1
            } else if value < (1 << 32) {
                2
            } else if value < (1 << 48) {
                3
            } else {
                4
            }
        }

        /// Emit a conditional jump instruction to a specific target. This is
        /// called when lowering any of the conditional jump instructions.
        fn emit_conditional_jump<const CONDITION: u8>(cb: &mut CodeBlock, target: Target) {
            fn generate_branch<const CONDITION: u8>(cb: &mut CodeBlock, src_addr: i64, dst_addr: i64) {
                let num_insns = if bcond_offset_fits_bits((dst_addr - src_addr) / 4) {
                    // If the jump offset fits into the conditional jump as
                    // an immediate value and it's properly aligned, then we
                    // can use the b.cond instruction directly. We're safe
                    // to use as i32 here since we already checked that it
                    // fits.
                    let bytes = (dst_addr - src_addr) as i32;
                    bcond(cb, CONDITION, InstructionOffset::from_bytes(bytes));

                    // Here we're going to return 1 because we've only
                    // written out 1 instruction.
                    1
                } else if b_offset_fits_bits((dst_addr - (src_addr + 4)) / 4) { // + 4 for bcond
                    // If the jump offset fits into the unconditional jump as
                    // an immediate value, we can use inverse b.cond + b.
                    //
                    // We're going to write out the inverse condition so
                    // that if it doesn't match it will skip over the
                    // instruction used for branching.
                    bcond(cb, Condition::inverse(CONDITION), 2.into());
                    b(cb, InstructionOffset::from_bytes((dst_addr - (src_addr + 4)) as i32)); // + 4 for bcond

                    // We've only written out 2 instructions.
                    2
                } else {
                    // Otherwise, we need to load the address into a
                    // register and use the branch register instruction.
                    let load_insns: i32 = emit_load_size(dst_addr as u64).into();

                    // We're going to write out the inverse condition so
                    // that if it doesn't match it will skip over the
                    // instructions used for branching.
                    bcond(cb, Condition::inverse(CONDITION), (load_insns + 2).into());
                    emit_load_value(cb, Assembler::EMIT_OPND, dst_addr as u64);
                    br(cb, Assembler::EMIT_OPND);

                    // Here we'll return the number of instructions that it
                    // took to write out the destination address + 1 for the
                    // b.cond and 1 for the br.
                    load_insns + 2
                };

                // We need to make sure we have at least 6 instructions for
                // every kind of jump for invalidation purposes, so we're
                // going to write out padding nop instructions here.
                assert!(num_insns <= cb.conditional_jump_insns());
                (num_insns..cb.conditional_jump_insns()).for_each(|_| nop(cb));
            }

            match target {
                Target::CodePtr(dst_ptr) => {
                    let dst_addr = dst_ptr.as_offset();
                    let src_addr = cb.get_write_ptr().as_offset();
                    generate_branch::<CONDITION>(cb, src_addr, dst_addr);
                },
                Target::Label(label_idx) => {
                    // We save `cb.conditional_jump_insns` number of bytes since we may use up to that amount
                    // `generate_branch` will pad the emitted branch instructions with `nop`s for each unused byte.
                    cb.label_ref(label_idx, (cb.conditional_jump_insns() * 4) as usize, |cb, src_addr, dst_addr| {
                        generate_branch::<CONDITION>(cb, src_addr - (cb.conditional_jump_insns() * 4) as i64, dst_addr);
                    });
                },
                Target::SideExit { .. } => {
                    unreachable!("Target::SideExit should have been compiled by compile_exits")
                },
            };
        }

        /// Emit a CBZ or CBNZ which branches when a register is zero or non-zero
        fn emit_cmp_zero_jump(cb: &mut CodeBlock, reg: A64Opnd, branch_if_zero: bool, target: Target) {
            if let Target::CodePtr(dst_ptr) = target {
                let dst_addr = dst_ptr.as_offset();
                let src_addr = cb.get_write_ptr().as_offset();

                if bcond_offset_fits_bits((dst_addr - src_addr) / 4) {
                    // If the offset fits in one instruction, generate cbz or cbnz
                    let bytes = (dst_addr - src_addr) as i32;
                    if branch_if_zero {
                        cbz(cb, reg, InstructionOffset::from_bytes(bytes));
                    } else {
                        cbnz(cb, reg, InstructionOffset::from_bytes(bytes));
                    }
                } else {
                    // Otherwise, we load the address into a register and
                    // use the branch register instruction. Note that because
                    // side exits should always be close, this form should be
                    // rare or impossible to see.
                    let dst_addr = dst_ptr.raw_addr(cb) as u64;
                    let load_insns: i32 = emit_load_size(dst_addr).into();

                    // Write out the inverse condition so that if
                    // it doesn't match it will skip over the
                    // instructions used for branching.
                    if branch_if_zero {
                        cbnz(cb, reg, InstructionOffset::from_insns(load_insns + 2));
                    } else {
                        cbz(cb, reg, InstructionOffset::from_insns(load_insns + 2));
                    }
                    emit_load_value(cb, Assembler::EMIT_OPND, dst_addr);
                    br(cb, Assembler::EMIT_OPND);
                }
            } else {
                unreachable!("We should only generate Joz/Jonz with side-exit targets");
            }
        }

        /// Do the address calculation of `out_reg = base_reg + disp`
        fn load_effective_address(cb: &mut CodeBlock, out: A64Opnd, base_reg_no: u8, disp: i32) {
            let base_reg = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: base_reg_no });
            let out_reg_no = out.unwrap_reg().reg_no;
            assert_ne!(31, out_reg_no, "Lea sp, [sp, #imm] not always encodable. Use add/sub instead.");
            assert_ne!(base_reg_no, out_reg_no, "large displacement need a scratch register");

            if ShiftedImmediate::try_from(disp.unsigned_abs() as u64).is_ok() {
                // Use ADD/SUB if the displacement fits
                add(cb, out, base_reg, A64Opnd::new_imm(disp.into()));
            } else {
                // Use add_extended() to interpret reg_no=31 as sp
                // since the base register is never the zero register.
                // Careful! Only the first two operands can refer to sp.
                emit_load_value(cb, out, disp as u64);
                add_extended(cb, out, base_reg, out);
            };
        }

        /// Load a VALUE to a register and remember it for GC marking and reference updating
        fn emit_load_gc_value(cb: &mut CodeBlock, gc_offsets: &mut Vec<CodePtr>, dest: A64Opnd, value: VALUE) {
            // We dont need to check if it's a special const
            // here because we only allow these operands to hit
            // this point if they're not a special const.
            assert!(!value.special_const_p());

            // This assumes only load instructions can contain
            // references to GC'd Value operands. If the value
            // being loaded is a heap object, we'll report that
            // back out to the gc_offsets list.
            ldr_literal(cb, dest, 2.into());
            b(cb, InstructionOffset::from_bytes(4 + (SIZEOF_VALUE as i32)));
            cb.write_bytes(&value.as_u64().to_le_bytes());

            let ptr_offset = cb.get_write_ptr().sub_bytes(SIZEOF_VALUE);
            gc_offsets.push(ptr_offset);
        }

        /// Emit a push instruction for the given operand by adding to the stack
        /// pointer and then storing the given value.
        fn emit_push(cb: &mut CodeBlock, opnd: A64Opnd) {
            str_pre(cb, opnd, A64Opnd::new_mem(64, C_SP_REG, -C_SP_STEP));
        }

        /// Emit a pop instruction into the given operand by loading the value
        /// and then subtracting from the stack pointer.
        fn emit_pop(cb: &mut CodeBlock, opnd: A64Opnd) {
            ldr_post(cb, opnd, A64Opnd::new_mem(64, C_SP_REG, C_SP_STEP));
        }

        // List of GC offsets
        let mut gc_offsets: Vec<CodePtr> = Vec::new();

        // Buffered list of PosMarker callbacks to fire if codegen is successful
        let mut pos_markers: Vec<(usize, CodePtr)> = vec![];

        // The write_pos for the last Insn::PatchPoint, if any
        let mut last_patch_pos: Option<usize> = None;

        // Install a panic hook to dump Assembler with insn_idx on dev builds
        let (_hook, mut hook_insn_idx) = AssemblerPanicHook::new(self, 0);

        // For each instruction
        let mut insn_idx: usize = 0;
        while let Some(insn) = self.insns.get(insn_idx) {
            // Update insn_idx that is shown on panic
            hook_insn_idx.as_mut().map(|idx| idx.lock().map(|mut idx| *idx = insn_idx).unwrap());

            match insn {
                Insn::Comment(text) => {
                    cb.add_comment(text);
                },
                Insn::Label(target) => {
                    cb.write_label(target.unwrap_label());
                },
                // Report back the current position in the generated code
                Insn::PosMarker(..) => {
                    pos_markers.push((insn_idx, cb.get_write_ptr()))
                }
                Insn::BakeString(text) => {
                    for byte in text.as_bytes() {
                        cb.write_byte(*byte);
                    }

                    // Add a null-terminator byte for safety (in case we pass
                    // this to C code)
                    cb.write_byte(0);

                    // Pad out the string to the next 4-byte boundary so that
                    // it's easy to jump past.
                    for _ in 0..(4 - ((text.len() + 1) % 4)) {
                        cb.write_byte(0);
                    }
                },
                &Insn::FrameSetup { preserved, mut slot_count } => {
                    const { assert!(SIZEOF_VALUE == 8, "alignment logic relies on SIZEOF_VALUE == 8"); }
                    // Preserve X29 and set up frame record
                    stp_pre(cb, X29, X30, A64Opnd::new_mem(128, C_SP_REG, -16));
                    mov(cb, X29, C_SP_REG);

                    for regs in preserved.chunks(2) {
                        // For the body, store pairs and move SP
                        if let [reg0, reg1] = regs {
                            stp_pre(cb, reg1.into(), reg0.into(), A64Opnd::new_mem(128, C_SP_REG, -16));
                        } else if let [reg] = regs {
                            // For overhang, store but don't move SP. Combine movement with
                            // movement for slots below.
                            stur(cb, reg.into(), A64Opnd::new_mem(64, C_SP_REG, -8));
                            slot_count += 1;
                        } else {
                            unreachable!("chunks(2)");
                        }
                    }
                    // Align slot_count
                    if slot_count % 2 == 1 {
                        slot_count += 1
                    }
                    if slot_count > 0 {
                        let slot_offset = (slot_count * SIZEOF_VALUE) as u64;
                        // Bail when asked to reserve too many slots in one instruction.
                        ShiftedImmediate::try_from(slot_offset).ok()?;
                        sub(cb, C_SP_REG, C_SP_REG, A64Opnd::new_uimm(slot_offset));
                    }
                }
                Insn::FrameTeardown { preserved } => {
                    // Restore preserved registers below frame pointer.
                    let mut base_offset = 0;
                    for regs in preserved.chunks(2) {
                        if let [reg0, reg1] = regs {
                            base_offset -= 16;
                            ldp(cb, reg1.into(), reg0.into(), A64Opnd::new_mem(128, X29, base_offset));
                        } else if let [reg] = regs {
                            ldur(cb, reg.into(), A64Opnd::new_mem(64, X29, base_offset - 8));
                        } else {
                            unreachable!("chunks(2)");
                        }
                    }

                    // SP = X29 (frame pointer)
                    mov(cb, C_SP_REG, X29);
                    ldp_post(cb, X29, X30, A64Opnd::new_mem(128, C_SP_REG, 16));
                }
                Insn::Add { left, right, out } => {
                    // Usually, we issue ADDS, so you could branch on overflow, but ADDS with
                    // out=31 refers to out=XZR, which discards the sum. So, instead of ADDS
                    // (aliased to CMN in this case) we issue ADD instead which writes the sum
                    // to the stack pointer; we assume you got x31 from NATIVE_STACK_POINTER.
                    let out: A64Opnd = out.into();
                    if let A64Opnd::Reg(A64Reg { reg_no: 31, .. }) = out {
                        add(cb, out, left.into(), right.into());
                    } else {
                        adds(cb, out, left.into(), right.into());
                    }
                },
                Insn::Sub { left, right, out } => {
                    // Usually, we issue SUBS, so you could branch on overflow, but SUBS with
                    // out=31 refers to out=XZR, which discards the result. So, instead of SUBS
                    // (aliased to CMP in this case) we issue SUB instead which writes the diff
                    // to the stack pointer; we assume you got x31 from NATIVE_STACK_POINTER.
                    let out: A64Opnd = out.into();
                    if let A64Opnd::Reg(A64Reg { reg_no: 31, .. }) = out {
                        sub(cb, out, left.into(), right.into());
                    } else {
                        subs(cb, out, left.into(), right.into());
                    }
                },
                Insn::Mul { left, right, out } => {
                    // If the next instruction is JoMul with RShift created by arm64_scratch_split
                    match (self.insns.get(insn_idx + 1), self.insns.get(insn_idx + 2)) {
                        (Some(Insn::RShift { out: out_sign, opnd: out_opnd, shift: out_shift }), Some(Insn::JoMul(_))) => {
                            // Compute the high 64 bits
                            smulh(cb, Self::EMIT_OPND, left.into(), right.into());

                            // Compute the low 64 bits
                            // This may clobber one of the input registers,
                            // so we do it after smulh
                            mul(cb, out.into(), left.into(), right.into());

                            // Insert the shift instruction created by arm64_scratch_split
                            // to prepare the register that has the sign bit of the high 64 bits after mul.
                            asr(cb, out_sign.into(), out_opnd.into(), out_shift.into());
                            insn_idx += 1; // skip the next Insn::RShift

                            // If the high 64-bits are not all zeros or all ones,
                            // matching the sign bit, then we have an overflow
                            cmp(cb, Self::EMIT_OPND, out_sign.into());
                            // Insn::JoMul will emit_conditional_jump::<{Condition::NE}>
                        }
                        _ => {
                            mul(cb, out.into(), left.into(), right.into());
                        }
                    }
                },
                Insn::And { left, right, out } => {
                    and(cb, out.into(), left.into(), right.into());
                },
                Insn::Or { left, right, out } => {
                    orr(cb, out.into(), left.into(), right.into());
                },
                Insn::Xor { left, right, out } => {
                    eor(cb, out.into(), left.into(), right.into());
                },
                Insn::Not { opnd, out } => {
                    mvn(cb, out.into(), opnd.into());
                },
                Insn::RShift { opnd, shift, out } => {
                    asr(cb, out.into(), opnd.into(), shift.into());
                },
                Insn::URShift { opnd, shift, out } => {
                    lsr(cb, out.into(), opnd.into(), shift.into());
                },
                Insn::LShift { opnd, shift, out } => {
                    lsl(cb, out.into(), opnd.into(), shift.into());
                },
                Insn::Store { dest, src } => {
                    // Split src into EMIT0_OPND if necessary
                    let src_reg: A64Reg = match src {
                        Opnd::Reg(reg) => *reg,
                        // Use zero register when possible
                        Opnd::UImm(0) | Opnd::Imm(0) => XZR_REG,
                        // Immediates
                        &Opnd::Imm(imm) => {
                            emit_load_value(cb, Self::EMIT_OPND, imm as u64);
                            Self::EMIT_REG
                        }
                        &Opnd::UImm(imm) => {
                            emit_load_value(cb, Self::EMIT_OPND, imm);
                            Self::EMIT_REG
                        }
                        &Opnd::Value(value) => {
                            emit_load_gc_value(cb, &mut gc_offsets, Self::EMIT_OPND, value);
                            Self::EMIT_REG
                        }
                        src_mem @ &Opnd::Mem(Mem { num_bits: src_num_bits, base: MemBase::Reg(src_base_reg_no), disp: src_disp }) => {
                            // For mem-to-mem store, load the source into EMIT0_OPND
                            let src_mem = if mem_disp_fits_bits(src_disp) {
                                src_mem.into()
                            } else {
                                // Split the load address into EMIT0_OPND first if necessary
                                load_effective_address(cb, Self::EMIT_OPND, src_base_reg_no, src_disp);
                                A64Opnd::new_mem(dest.rm_num_bits(), Self::EMIT_OPND, 0)
                            };
                            let dst = A64Opnd::Reg(Self::EMIT_REG.with_num_bits(src_num_bits));
                            match src_num_bits {
                                64 | 32 => ldur(cb, dst, src_mem),
                                16 => ldurh(cb, dst, src_mem),
                                8 => ldurb(cb, dst, src_mem),
                                num_bits => panic!("unexpected num_bits: {num_bits}")
                            };
                            Self::EMIT_REG
                        }
                        src @ (Opnd::Mem(_) | Opnd::None | Opnd::VReg { .. }) => panic!("Unexpected source operand during arm64_emit: {src:?}")
                    };
                    let src = A64Opnd::Reg(src_reg);

                    // This order may be surprising but it is correct. The way
                    // the Arm64 assembler works, the register that is going to
                    // be stored is first and the address is second. However in
                    // our IR we have the address first and the register second.
                    match dest.rm_num_bits() {
                        64 | 32 => stur(cb, src, dest.into()),
                        16 => sturh(cb, src, dest.into()),
                        8 => sturb(cb, src, dest.into()),
                        num_bits => panic!("unexpected dest num_bits: {} (src: {:?}, dest: {:?})", num_bits, src, dest),
                    }
                },
                Insn::Load { opnd, out } |
                Insn::LoadInto { opnd, dest: out } => {
                    match *opnd {
                        Opnd::Reg(_) | Opnd::VReg { .. } => {
                            mov(cb, out.into(), opnd.into());
                        },
                        Opnd::UImm(uimm) => {
                            emit_load_value(cb, out.into(), uimm);
                        },
                        Opnd::Imm(imm) => {
                            emit_load_value(cb, out.into(), imm as u64);
                        },
                        Opnd::Mem(_) => {
                            match opnd.rm_num_bits() {
                                64 | 32 => ldur(cb, out.into(), opnd.into()),
                                16 => ldurh(cb, out.into(), opnd.into()),
                                8 => ldurb(cb, out.into(), opnd.into()),
                                num_bits => panic!("unexpected num_bits: {}", num_bits)
                            };
                        },
                        Opnd::Value(value) => {
                            emit_load_gc_value(cb, &mut gc_offsets, out.into(), value);
                        },
                        Opnd::None => {
                            unreachable!("Attempted to load from None operand");
                        }
                    };
                },
                Insn::LoadSExt { opnd, out } => {
                    match *opnd {
                        Opnd::Reg(Reg { num_bits: 32, .. }) |
                        Opnd::VReg { num_bits: 32, .. } => {
                            sxtw(cb, out.into(), opnd.into());
                        },
                        Opnd::Mem(Mem { num_bits: 32, .. }) => {
                            ldursw(cb, out.into(), opnd.into());
                        },
                        _ => unreachable!()
                    };
                },
                Insn::ParallelMov { .. } => unreachable!("{insn:?} should have been lowered at alloc_regs()"),
                Insn::Mov { dest, src } => {
                    // This supports the following two kinds of immediates:
                    //   * The value fits into a single movz instruction
                    //   * It can be encoded with the special bitmask immediate encoding
                    // arm64_split() should have split other immediates that require multiple instructions.
                    match src {
                        Opnd::UImm(uimm) if *uimm <= 0xffff => {
                            movz(cb, dest.into(), A64Opnd::new_uimm(*uimm), 0);
                        },
                        _ => {
                            mov(cb, dest.into(), src.into());
                        }
                    }
                },
                Insn::Lea { opnd, out } => {
                    let &Opnd::Mem(Mem { num_bits: _, base: MemBase::Reg(base_reg_no), disp }) = opnd else {
                        panic!("Unexpected Insn::Lea operand in arm64_emit: {opnd:?}");
                    };
                    let out_reg_no = out.unwrap_reg().reg_no;
                    assert_ne!(31, out_reg_no, "Lea sp, [sp, #imm] not always encodable. Use add/sub instead.");

                    let out = A64Opnd::from(out);
                    let base_reg = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: base_reg_no });
                    if ShiftedImmediate::try_from(disp.unsigned_abs() as u64).is_ok() {
                        // Use ADD/SUB if the displacement fits
                        add(cb, out, base_reg, A64Opnd::new_imm(disp.into()));
                    } else {
                        // Use a scratch reg for `out += displacement`
                        let disp_reg = if out_reg_no == base_reg_no {
                            Self::EMIT_OPND
                        } else {
                            out
                        };
                        // Use add_extended() to interpret reg_no=31 as sp
                        // since the base register is never the zero register.
                        // Careful! Only the first two operands can refer to sp.
                        emit_load_value(cb, disp_reg, disp as u64);
                        add_extended(cb, out, base_reg, disp_reg);
                    }
                }
                Insn::LeaJumpTarget { out, target, .. } => {
                    if let Target::Label(label_idx) = target {
                        // Set output to the raw address of the label
                        cb.label_ref(*label_idx, 4, |cb, end_addr, dst_addr| {
                            adr(cb, Self::EMIT_OPND, A64Opnd::new_imm(dst_addr - (end_addr - 4)));
                        });

                        mov(cb, out.into(), Self::EMIT_OPND);
                    } else {
                        // Set output to the jump target's raw address
                        let target_code = target.unwrap_code_ptr();
                        let target_addr = target_code.raw_addr(cb).as_u64();
                        emit_load_value(cb, out.into(), target_addr);
                    }
                },
                Insn::CPush(opnd) => {
                    emit_push(cb, opnd.into());
                },
                Insn::CPop { out } => {
                    emit_pop(cb, out.into());
                },
                Insn::CPopInto(opnd) => {
                    emit_pop(cb, opnd.into());
                },
                Insn::CPushAll => {
                    let regs = Assembler::get_caller_save_regs();

                    for reg in regs {
                        emit_push(cb, A64Opnd::Reg(reg));
                    }

                    // Push the flags/state register
                    mrs(cb, Self::EMIT_OPND, SystemRegister::NZCV);
                    emit_push(cb, Self::EMIT_OPND);
                },
                Insn::CPopAll => {
                    let regs = Assembler::get_caller_save_regs();

                    // Pop the state/flags register
                    msr(cb, SystemRegister::NZCV, Self::EMIT_OPND);
                    emit_pop(cb, Self::EMIT_OPND);

                    for reg in regs.into_iter().rev() {
                        emit_pop(cb, A64Opnd::Reg(reg));
                    }
                },
                Insn::CCall { fptr, .. } => {
                    match fptr {
                        Opnd::UImm(fptr) => {
                            // The offset to the call target in bytes
                            let src_addr = cb.get_write_ptr().raw_ptr(cb) as i64;
                            let dst_addr = *fptr as i64;

                            // Use BL if the offset is short enough to encode as an immediate.
                            // Otherwise, use BLR with a register.
                            if b_offset_fits_bits((dst_addr - src_addr) / 4) {
                                bl(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32));
                            } else {
                                emit_load_value(cb, Self::EMIT_OPND, dst_addr as u64);
                                blr(cb, Self::EMIT_OPND);
                            }
                        }
                        Opnd::Reg(_) => {
                            blr(cb, fptr.into());
                        }
                        _ => unreachable!("unsupported ccall fptr: {fptr:?}")
                    }
                },
                Insn::CRet { .. } => {
                    ret(cb, A64Opnd::None);
                },
                Insn::Cmp { left, right } => {
                    cmp(cb, left.into(), right.into());
                },
                Insn::Test { left, right } => {
                    tst(cb, left.into(), right.into());
                },
                Insn::JmpOpnd(opnd) => {
                    br(cb, opnd.into());
                },
                Insn::Jmp(target) => {
                    match *target {
                        Target::CodePtr(dst_ptr) => {
                            emit_jmp_ptr(cb, dst_ptr, true);
                        },
                        Target::Label(label_idx) => {
                            // Here we're going to save enough space for
                            // ourselves and then come back and write the
                            // instruction once we know the offset. We're going
                            // to assume we can fit into a single b instruction.
                            // It will panic otherwise.
                            cb.label_ref(label_idx, 4, |cb, src_addr, dst_addr| {
                                let bytes: i32 = (dst_addr - (src_addr - 4)).try_into().unwrap();
                                b(cb, InstructionOffset::from_bytes(bytes));
                            });
                        },
                        Target::SideExit { .. } => {
                            unreachable!("Target::SideExit should have been compiled by compile_exits")
                        },
                    };
                },
                Insn::Je(target) | Insn::Jz(target) => {
                    emit_conditional_jump::<{Condition::EQ}>(cb, target.clone());
                },
                Insn::Jne(target) | Insn::Jnz(target) | Insn::JoMul(target) => {
                    emit_conditional_jump::<{Condition::NE}>(cb, target.clone());
                },
                Insn::Jl(target) => {
                    emit_conditional_jump::<{Condition::LT}>(cb, target.clone());
                },
                Insn::Jg(target) => {
                    emit_conditional_jump::<{Condition::GT}>(cb, target.clone());
                },
                Insn::Jge(target) => {
                    emit_conditional_jump::<{Condition::GE}>(cb, target.clone());
                },
                Insn::Jbe(target) => {
                    emit_conditional_jump::<{Condition::LS}>(cb, target.clone());
                },
                Insn::Jb(target) => {
                    emit_conditional_jump::<{Condition::CC}>(cb, target.clone());
                },
                Insn::Jo(target) => {
                    emit_conditional_jump::<{Condition::VS}>(cb, target.clone());
                },
                Insn::Joz(opnd, target) => {
                    emit_cmp_zero_jump(cb, opnd.into(), true, target.clone());
                },
                Insn::Jonz(opnd, target) => {
                    emit_cmp_zero_jump(cb, opnd.into(), false, target.clone());
                },
                Insn::PatchPoint { .. } => unreachable!("PatchPoint should have been lowered to PadPatchPoint in arm64_scratch_split"),
                Insn::PadPatchPoint => {
                    // If patch points are too close to each other or the end of the block, fill nop instructions
                    if let Some(last_patch_pos) = last_patch_pos {
                        while cb.get_write_pos().saturating_sub(last_patch_pos) < cb.jmp_ptr_bytes() && !cb.has_dropped_bytes() {
                            nop(cb);
                        }
                    }
                    last_patch_pos = Some(cb.get_write_pos());
                },
                Insn::IncrCounter { mem, value } => {
                    // Get the status register allocated by arm64_scratch_split
                    let Some(Insn::Cmp {
                        left: status_reg @ Opnd::Reg(_),
                        right: Opnd::UImm(_) | Opnd::Imm(_),
                    }) = self.insns.get(insn_idx + 1) else {
                        panic!("arm64_scratch_split should add Cmp after IncrCounter: {:?}", self.insns.get(insn_idx + 1));
                    };

                    // Attempt to increment a counter
                    ldaxr(cb, Self::EMIT_OPND, mem.into());
                    add(cb, Self::EMIT_OPND, Self::EMIT_OPND, value.into());

                    // The status register that gets used to track whether or
                    // not the store was successful must be 32 bits. Since we
                    // store the EMIT registers as their 64-bit versions, we
                    // need to rewrap it here.
                    let status = A64Opnd::Reg(status_reg.unwrap_reg().with_num_bits(32));
                    stlxr(cb, status, Self::EMIT_OPND, mem.into());
                },
                Insn::Breakpoint => {
                    brk(cb, A64Opnd::None);
                },
                Insn::CSelZ { truthy, falsy, out } |
                Insn::CSelE { truthy, falsy, out } => {
                    csel(cb, out.into(), truthy.into(), falsy.into(), Condition::EQ);
                },
                Insn::CSelNZ { truthy, falsy, out } |
                Insn::CSelNE { truthy, falsy, out } => {
                    csel(cb, out.into(), truthy.into(), falsy.into(), Condition::NE);
                },
                Insn::CSelL { truthy, falsy, out } => {
                    csel(cb, out.into(), truthy.into(), falsy.into(), Condition::LT);
                },
                Insn::CSelLE { truthy, falsy, out } => {
                    csel(cb, out.into(), truthy.into(), falsy.into(), Condition::LE);
                },
                Insn::CSelG { truthy, falsy, out } => {
                    csel(cb, out.into(), truthy.into(), falsy.into(), Condition::GT);
                },
                Insn::CSelGE { truthy, falsy, out } => {
                    csel(cb, out.into(), truthy.into(), falsy.into(), Condition::GE);
                }
                Insn::LiveReg { .. } => (), // just a reg alloc signal, no code
            };

            insn_idx += 1;
        }

        // Error if we couldn't write out everything
        if cb.has_dropped_bytes() {
            None
        } else {
            // No bytes dropped, so the pos markers point to valid code
            for (insn_idx, pos) in pos_markers {
                if let Insn::PosMarker(callback) = self.insns.get(insn_idx).unwrap() {
                    callback(pos, cb);
                } else {
                    panic!("non-PosMarker in pos_markers insn_idx={insn_idx} {self:?}");
                }
            }

            Some(gc_offsets)
        }
    }

    /// Optimize and compile the stored instructions.
    ///
    /// Pipeline: `arm64_split` -> `alloc_regs` -> `compile_exits` ->
    /// (optionally) `arm64_scratch_split` -> `arm64_emit` into `cb`.
    /// On success, returns the start pointer of the emitted code together with
    /// the GC offsets collected during emission. Returns
    /// `CompileError::OutOfMemory` when the code block ran out of space, and
    /// propagates any error from `alloc_regs`.
    pub fn compile_with_regs(self, cb: &mut CodeBlock, regs: Vec<Reg>) -> Result<(CodePtr, Vec<CodePtr>), CompileError> {
        // The backend is allowed to use scratch registers only if it has not accepted them so far.
        // Captured up front because `self` is consumed by the pass pipeline below.
        let use_scratch_reg = !self.accept_scratch_reg;
        asm_dump!(self, init);

        let asm = self.arm64_split();
        asm_dump!(asm, split);

        let mut asm = asm.alloc_regs(regs)?;
        asm_dump!(asm, alloc_regs);

        // We put compile_exits after alloc_regs to avoid extending live ranges for VRegs spilled on side exits.
        asm.compile_exits();
        asm_dump!(asm, compile_exits);

        if use_scratch_reg {
            asm = asm.arm64_scratch_split();
            asm_dump!(asm, scratch_split);
        }

        // Create label instances in the code block
        for (idx, name) in asm.label_names.iter().enumerate() {
            let label = cb.new_label(name.to_string());
            assert_eq!(label, Label(idx));
        }

        let start_ptr = cb.get_write_ptr();
        let gc_offsets = asm.arm64_emit(cb);

        // Emission succeeded only if arm64_emit returned offsets AND no bytes were dropped.
        if let (Some(gc_offsets), false) = (gc_offsets, cb.has_dropped_bytes()) {
            cb.link_labels();

            // Invalidate icache for newly written out region so we don't run stale code.
            unsafe { rb_jit_icache_invalidate(start_ptr.raw_ptr(cb) as _, cb.get_write_ptr().raw_ptr(cb) as _) };

            Ok((start_ptr, gc_offsets))
        } else {
            cb.clear_labels();
            Err(CompileError::OutOfMemory)
        }
    }
}

/// Attempt to fold a following `Insn::Mov` into a lowered instruction that
/// takes 2 input registers and an output register. For example:
///
///     Add out, a, b
///     Mov c, out
///
/// Can become:
///
///     Add c, a, b
///
/// If a, b, and c are all registers and `out`'s live range ends at the `Mov`.
fn merge_three_reg_mov(
    live_ranges: &[LiveRange],
    iterator: &mut InsnIter,
    asm: &mut Assembler,
    left: &Opnd,
    right: &Opnd,
    out: &mut Opnd,
) {
    // The rewrite is only valid when both inputs are registers.
    let is_reg = |opnd: &Opnd| matches!(opnd, Opnd::Reg(_) | Opnd::VReg { .. });
    if !is_reg(left) || !is_reg(right) {
        return;
    }

    // Peek at the next instruction, looking for a Mov that copies our output.
    if let Some((mov_idx, Insn::Mov { dest, src })) = iterator.peek() {
        // Merge only if the Mov consumes `out`, `out` dies right at the Mov,
        // and the Mov's destination is itself a register.
        if out == src && live_ranges[out.vreg_idx()].end() == *mov_idx && is_reg(dest) {
            *out = *dest;
            iterator.next(asm); // Pop merged Insn::Mov
        }
    }
}

#[cfg(test)]
mod tests {
    #[cfg(feature = "disasm")]
    use crate::disasms_with;
    use crate::{assert_disasm_snapshot, hexdumps};

    use super::*;
    use insta::assert_snapshot;

    static TEMP_REGS: [Reg; 5] = [X1_REG, X9_REG, X10_REG, X14_REG, X15_REG];

    /// Build a fresh assembler plus a dummy code block for a test to emit into.
    fn setup_asm() -> (Assembler, CodeBlock) {
        // Allow `get_option!` in Assembler
        crate::options::rb_zjit_prepare_options();
        let asm = Assembler::new();
        let cb = CodeBlock::new_dummy();
        (asm, cb)
    }

    /// Snapshot-test the textual LIR dump produced by `lir_string` for a
    /// representative mix of labels, comments, stores, jumps, and moves.
    #[test]
    fn test_lir_string() {
        use crate::hir::SideExitReason;

        let mut asm = Assembler::new();
        asm.stack_base_idx = 1;

        let label = asm.new_label("bb0");
        asm.write_label(label.clone());
        asm.push_insn(Insn::Comment("bb0(): foo@/tmp/a.rb:1".into()));
        asm.frame_setup(JIT_PRESERVED_REGS);

        let val64 = asm.add(CFP, Opnd::UImm(64));
        asm.store(Opnd::mem(64, SP, 0x10), val64);
        let side_exit = Target::SideExit { reason: SideExitReason::Interrupt, exit: SideExit { pc: Opnd::const_ptr(0 as *const u8), stack: vec![], locals: vec![] } };
        asm.push_insn(Insn::Joz(val64, side_exit));
        asm.parallel_mov(vec![(C_ARG_OPNDS[0], C_RET_OPND.with_num_bits(32)), (C_ARG_OPNDS[1], Opnd::mem(64, SP, -8))]);

        let val32 = asm.sub(Opnd::Value(Qtrue), Opnd::Imm(1));
        asm.store(Opnd::mem(64, EC, 0x10).with_num_bits(32), val32.with_num_bits(32));
        asm.je(label);
        asm.cret(val64);

        asm.frame_teardown(JIT_PRESERVED_REGS);
        assert_disasm_snapshot!(lir_string(&mut asm), @r"
        bb0:
          # bb0(): foo@/tmp/a.rb:1
          FrameSetup 1, x19, x21, x20
          v0 = Add x19, 0x40
          Store [x21 + 0x10], v0
          Joz Exit(Interrupt), v0
          ParallelMov x0 <- w0, x1 <- [x21 - 8]
          v1 = Sub Value(0x14), Imm(1)
          Store Mem32[x20 + 0x10], VReg32(v1)
          Je bb0
          CRet v0
          FrameTeardown x19, x21, x20
        ");
    }

    /// Mul with an immediate operand: the immediate is materialized into a
    /// register before the `mul`, then the result is moved to the destination.
    #[test]
    fn test_mul_with_immediate() {
        let (mut asm, mut cb) = setup_asm();

        let out = asm.mul(Opnd::Reg(TEMP_REGS[1]), 3.into());
        asm.mov(Opnd::Reg(TEMP_REGS[0]), out);
        asm.compile_with_num_regs(&mut cb, 2);

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x0, #3
            0x4: mul x0, x9, x0
            0x8: mov x1, x0
        ");
        assert_snapshot!(cb.hexdump(), @"600080d2207d009be10300aa");
    }

    /// Small immediate add/sub on the stack pointer should each lower to a
    /// single instruction (no scratch register needed).
    #[test]
    fn sp_movements_are_single_instruction() {
        let (mut asm, mut cb) = setup_asm();

        let sp = Opnd::Reg(XZR_REG);
        let new_sp = asm.add(sp, 0x20.into());
        asm.mov(sp, new_sp);
        let new_sp = asm.sub(sp, 0x20.into());
        asm.mov(sp, new_sp);

        asm.compile_with_num_regs(&mut cb, 2);
        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: add sp, sp, #0x20
            0x4: sub sp, sp, #0x20
        ");
        assert_snapshot!(cb.hexdump(), @"ff830091ff8300d1");
    }

    /// add_into emits flag-less `add` for the stack pointer and flag-setting
    /// `adds` for a normal register, per the snapshot below.
    #[test]
    fn add_into() {
        let (mut asm, mut cb) = setup_asm();

        let sp = Opnd::Reg(XZR_REG);
        asm.add_into(sp, 8.into());
        asm.add_into(Opnd::Reg(X20_REG), 0x20.into());

        asm.compile_with_num_regs(&mut cb, 0);
        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: add sp, sp, #8
            0x4: adds x20, x20, #0x20
        ");
        assert_snapshot!(cb.hexdump(), @"ff230091948200b1");
    }

    /// Sub with an immediate on the left: the immediate is loaded into a
    /// register first, then subtracted from.
    #[test]
    fn sub_imm_reg() {
        let (mut asm, mut cb) = setup_asm();

        let difference = asm.sub(0x8.into(), Opnd::Reg(X5_REG));
        asm.load_into(Opnd::Reg(X1_REG), difference);

        asm.compile_with_num_regs(&mut cb, 1);
        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x0, #8
            0x4: subs x0, x0, x5
            0x8: mov x1, x0
        ");
        assert_snapshot!(cb.hexdump(), @"000180d2000005ebe10300aa");
    }

    /// A load whose result directly feeds CRet should not leave a dead `mov`
    /// behind; only the `ldur` and `ret` are emitted.
    #[test]
    fn no_dead_mov_from_vreg() {
        let (mut asm, mut cb) = setup_asm();

        let ret_val = asm.load(Opnd::mem(64, C_RET_OPND, 0));
        asm.cret(ret_val);

        asm.compile_with_num_regs(&mut cb, 1);
        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: ldur x0, [x0]
            0x4: ret
        ");
        assert_snapshot!(cb.hexdump(), @"000040f8c0035fd6");
    }

    /// Add of two registers stored to memory compiles to exactly two
    /// instructions when given a single allocatable register (x3).
    #[test]
    fn test_emit_add() {
        let (mut asm, mut cb) = setup_asm();

        let opnd = asm.add(Opnd::Reg(X0_REG), Opnd::Reg(X1_REG));
        asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd);
        asm.compile_with_regs(&mut cb, vec![X3_REG]).unwrap();

        // Assert that only 2 instructions were written.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: adds x3, x0, x1
        0x4: stur x3, [x2]
        ");
        assert_snapshot!(cb.hexdump(), @"030001ab430000f8");
    }

    /// bake_string embeds the raw string bytes into the code stream.
    #[test]
    fn test_emit_bake_string() {
        let (mut asm, mut cb) = setup_asm();

        asm.bake_string("Hello, world!");
        asm.compile_with_num_regs(&mut cb, 0);

        // Testing that we pad the string to the nearest 4-byte boundary to make
        // it easier to jump over.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: ldnp d8, d25, [x10, #-0x140]
        0x4: .byte 0x6f, 0x2c, 0x20, 0x77
        0x8: .byte 0x6f, 0x72, 0x6c, 0x64
        0xc: .byte 0x21, 0x00, 0x00, 0x00
        ");
        assert_snapshot!(cb.hexdump(), @"48656c6c6f2c20776f726c6421000000");
    }

    /// cpush_all pushes every caller-saved register, then reads NZCV into a
    /// scratch register and pushes that too.
    #[test]
    fn test_emit_cpush_all() {
        let (mut asm, mut cb) = setup_asm();

        asm.cpush_all();
        asm.compile_with_num_regs(&mut cb, 0);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: str x1, [sp, #-0x10]!
        0x4: str x9, [sp, #-0x10]!
        0x8: str x10, [sp, #-0x10]!
        0xc: str x11, [sp, #-0x10]!
        0x10: str x12, [sp, #-0x10]!
        0x14: str x13, [sp, #-0x10]!
        0x18: str x14, [sp, #-0x10]!
        0x1c: str x15, [sp, #-0x10]!
        0x20: mrs x16, nzcv
        0x24: str x16, [sp, #-0x10]!
        ");
        assert_snapshot!(cb.hexdump(), @"e10f1ff8e90f1ff8ea0f1ff8eb0f1ff8ec0f1ff8ed0f1ff8ee0f1ff8ef0f1ff810423bd5f00f1ff8");
    }

    /// cpop_all restores NZCV first, then pops the caller-saved registers in
    /// the reverse order of cpush_all.
    #[test]
    fn test_emit_cpop_all() {
        let (mut asm, mut cb) = setup_asm();

        asm.cpop_all();
        asm.compile_with_num_regs(&mut cb, 0);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: msr nzcv, x16
        0x4: ldr x16, [sp], #0x10
        0x8: ldr x15, [sp], #0x10
        0xc: ldr x14, [sp], #0x10
        0x10: ldr x13, [sp], #0x10
        0x14: ldr x12, [sp], #0x10
        0x18: ldr x11, [sp], #0x10
        0x1c: ldr x10, [sp], #0x10
        0x20: ldr x9, [sp], #0x10
        0x24: ldr x1, [sp], #0x10
        ");
        assert_snapshot!(cb.hexdump(), @"10421bd5f00741f8ef0741f8ee0741f8ed0741f8ec0741f8eb0741f8ea0741f8e90741f8e10741f8");
    }

    /// frame_setup/frame_teardown with no preserved registers: just the
    /// frame-pointer/link-register push/pop pair and fp <-> sp moves.
    #[test]
    fn test_emit_frame() {
        let (mut asm, mut cb) = setup_asm();

        asm.frame_setup(&[]);
        asm.frame_teardown(&[]);
        asm.compile_with_num_regs(&mut cb, 0);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: stp x29, x30, [sp, #-0x10]!
        0x4: mov x29, sp
        0x8: mov sp, x29
        0xc: ldp x29, x30, [sp], #0x10
        ");
        assert_snapshot!(cb.hexdump(), @"fd7bbfa9fd030091bf030091fd7bc1a8");
    }

    /// Exercise frame setup/teardown with odd/even counts of preserved
    /// registers and odd/even stack slot counts, to cover the stp/stur pairing
    /// and 16-byte sp alignment in each combination.
    #[test]
    fn frame_setup_and_teardown() {
        const THREE_REGS: &[Opnd] = &[Opnd::Reg(X19_REG), Opnd::Reg(X20_REG), Opnd::Reg(X21_REG)];
        // Test 3 preserved regs (odd), odd slot_count
        let cb1 = {
            let (mut asm, mut cb) = setup_asm();
            asm.stack_base_idx = 3;
            asm.frame_setup(THREE_REGS);
            asm.frame_teardown(THREE_REGS);
            asm.compile_with_num_regs(&mut cb, 0);
            cb
        };

        // Test 3 preserved regs (odd), even slot_count
        let cb2 = {
            let (mut asm, mut cb) = setup_asm();
            asm.stack_base_idx = 4;
            asm.frame_setup(THREE_REGS);
            asm.frame_teardown(THREE_REGS);
            asm.compile_with_num_regs(&mut cb, 0);
            cb
        };

        // Test 4 preserved regs (even), odd slot_count
        let cb3 = {
            static FOUR_REGS: &[Opnd] = &[Opnd::Reg(X19_REG), Opnd::Reg(X20_REG), Opnd::Reg(X21_REG), Opnd::Reg(X22_REG)];
            let (mut asm, mut cb) = setup_asm();
            asm.stack_base_idx = 3;
            asm.frame_setup(FOUR_REGS);
            asm.frame_teardown(FOUR_REGS);
            asm.compile_with_num_regs(&mut cb, 0);
            cb
        };

        assert_disasm_snapshot!(disasms_with!("\n", cb1, cb2, cb3), @r"
        0x0: stp x29, x30, [sp, #-0x10]!
        0x4: mov x29, sp
        0x8: stp x20, x19, [sp, #-0x10]!
        0xc: stur x21, [sp, #-8]
        0x10: sub sp, sp, #0x20
        0x14: ldp x20, x19, [x29, #-0x10]
        0x18: ldur x21, [x29, #-0x18]
        0x1c: mov sp, x29
        0x20: ldp x29, x30, [sp], #0x10

        0x0: stp x29, x30, [sp, #-0x10]!
        0x4: mov x29, sp
        0x8: stp x20, x19, [sp, #-0x10]!
        0xc: stur x21, [sp, #-8]
        0x10: sub sp, sp, #0x30
        0x14: ldp x20, x19, [x29, #-0x10]
        0x18: ldur x21, [x29, #-0x18]
        0x1c: mov sp, x29
        0x20: ldp x29, x30, [sp], #0x10

        0x0: stp x29, x30, [sp, #-0x10]!
        0x4: mov x29, sp
        0x8: stp x20, x19, [sp, #-0x10]!
        0xc: stp x22, x21, [sp, #-0x10]!
        0x10: sub sp, sp, #0x20
        0x14: ldp x20, x19, [x29, #-0x10]
        0x18: ldp x22, x21, [x29, #-0x20]
        0x1c: mov sp, x29
        0x20: ldp x29, x30, [sp], #0x10
        ");
        assert_snapshot!(hexdumps!(cb1, cb2, cb3), @r"
        fd7bbfa9fd030091f44fbfa9f5831ff8ff8300d1b44f7fa9b5835ef8bf030091fd7bc1a8
        fd7bbfa9fd030091f44fbfa9f5831ff8ffc300d1b44f7fa9b5835ef8bf030091fd7bc1a8
        fd7bbfa9fd030091f44fbfa9f657bfa9ff8300d1b44f7fa9b6577ea9bf030091fd7bc1a8
        ");
    }

    /// A short-range je fits in a single b.eq; the remaining reserved space is
    /// filled with nops.
    #[test]
    fn test_emit_je_fits_into_bcond() {
        let (mut asm, mut cb) = setup_asm();

        let target: CodePtr = cb.get_write_ptr().add_bytes(80);

        asm.je(Target::CodePtr(target));
        asm.compile_with_num_regs(&mut cb, 0);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: b.eq #0x50
        0x4: nop
        0x8: nop
        0xc: nop
        0x10: nop
        0x14: nop
        ");
        assert_snapshot!(cb.hexdump(), @"800200541f2003d51f2003d51f2003d51f2003d51f2003d5");
    }

    /// A je whose target is out of b.cond range is emitted as an inverted
    /// b.ne that skips over an unconditional b to the target.
    #[test]
    fn test_emit_je_does_not_fit_into_bcond() {
        let (mut asm, mut cb) = setup_asm();

        let offset = 1 << 21;
        let target: CodePtr = cb.get_write_ptr().add_bytes(offset);

        asm.je(Target::CodePtr(target));
        asm.compile_with_num_regs(&mut cb, 0);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: b.ne #8
        0x4: b #0x200000
        0x8: nop
        0xc: nop
        0x10: nop
        0x14: nop
        ");
        assert_snapshot!(cb.hexdump(), @"41000054ffff07141f2003d51f2003d51f2003d51f2003d5");
    }

    /// Lea with displacements of various magnitudes: small ones use add/sub
    /// immediates, larger ones load the displacement into a register first.
    #[test]
    fn test_emit_lea() {
        let (mut asm, mut cb) = setup_asm();

        // Test values that exercise various types of immediates.
        //  - 9 bit displacement for Load/Store
        //  - 12 bit ADD/SUB shifted immediate
        //  - 16 bit MOV family shifted immediates
        //  - bit mask immediates
        for displacement in [i32::MAX, 0x10008, 0x1800, 0x208, -0x208, -0x1800, -0x10008, i32::MIN] {
            let mem = Opnd::mem(64, NATIVE_STACK_PTR, displacement);
            asm.lea_into(Opnd::Reg(X0_REG), mem);
        }

        asm.compile_with_num_regs(&mut cb, 0);
        assert_disasm_snapshot!(cb.disasm(), @r"
            0x0: orr x0, xzr, #0x7fffffff
            0x4: add x0, sp, x0
            0x8: mov x0, #8
            0xc: movk x0, #1, lsl #16
            0x10: add x0, sp, x0
            0x14: mov x0, #0x1800
            0x18: add x0, sp, x0
            0x1c: add x0, sp, #0x208
            0x20: sub x0, sp, #0x208
            0x24: mov x0, #-0x1800
            0x28: add x0, sp, x0
            0x2c: mov x0, #0xfff8
            0x30: movk x0, #0xfffe, lsl #16
            0x34: movk x0, #0xffff, lsl #32
            0x38: movk x0, #0xffff, lsl #48
            0x3c: add x0, sp, x0
            0x40: orr x0, xzr, #0xffffffff80000000
            0x44: add x0, sp, x0
        ");
        assert_snapshot!(cb.hexdump(), @"e07b40b2e063208b000180d22000a0f2e063208b000083d2e063208be0230891e02308d1e0ff8292e063208b00ff9fd2c0ffbff2e0ffdff2e0fffff2e063208be08361b2e063208b");
    }

    /// A load whose displacement doesn't fit the load encoding materializes
    /// the offset in a scratch register and adds it to the base first.
    #[test]
    fn test_load_larg_disp_mem() {
        let (mut asm, mut cb) = setup_asm();

        let extended_ivars = asm.load(Opnd::mem(64, NATIVE_STACK_PTR, 0));
        let result = asm.load(Opnd::mem(VALUE_BITS, extended_ivars, 1000 * SIZEOF_VALUE_I32));
        asm.store(Opnd::mem(VALUE_BITS, NATIVE_STACK_PTR, 0), result);

        asm.compile_with_num_regs(&mut cb, 1);
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: ldur x0, [sp]
        0x4: mov x16, #0x1f40
        0x8: add x0, x0, x16, uxtx
        0xc: ldur x0, [x0]
        0x10: stur x0, [sp]
        ");
        assert_snapshot!(cb.hexdump(), @"e00340f810e883d20060308b000040f8e00300f8");
    }

    /// Stores where the source and/or destination have displacements too large
    /// for stur/ldur go through scratch registers (x15/x16).
    #[test]
    fn test_store() {
        let (mut asm, mut cb) = setup_asm();

        // Large memory offsets in combinations of destination and source
        let large_mem = Opnd::mem(64, NATIVE_STACK_PTR, -0x305);
        let small_mem = Opnd::mem(64, C_RET_OPND, 0);
        asm.store(small_mem, large_mem);
        asm.store(large_mem, small_mem);
        asm.store(large_mem, large_mem);

        asm.compile_with_num_regs(&mut cb, 0);
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: sub x16, sp, #0x305
        0x4: ldur x16, [x16]
        0x8: stur x16, [x0]
        0xc: sub x15, sp, #0x305
        0x10: ldur x16, [x0]
        0x14: stur x16, [x15]
        0x18: sub x15, sp, #0x305
        0x1c: sub x16, sp, #0x305
        0x20: ldur x16, [x16]
        0x24: stur x16, [x15]
        ");
        assert_snapshot!(cb.hexdump(), @"f0170cd1100240f8100000f8ef170cd1100040f8f00100f8ef170cd1f0170cd1100240f8f00100f8");
    }

    /// Storing a heap VALUE directly (no split pass) should load it via a
    /// PC-relative literal and report its position as a GC offset.
    #[test]
    fn test_store_value_without_split() {
        let (mut asm, mut cb) = setup_asm();

        let imitation_heap_value = VALUE(0x1000);
        assert!(imitation_heap_value.heap_object_p());
        asm.store(Opnd::mem(VALUE_BITS, SP, 0), imitation_heap_value.into());

        // Side exit code are compiled without the split pass, so we directly call emit here to
        // emulate that scenario.
        let gc_offsets = asm.arm64_emit(&mut cb).unwrap();
        assert_eq!(1, gc_offsets.len(), "VALUE source operand should be reported as gc offset");

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: ldr x16, #8
            0x4: b #0x10
            0x8: .byte 0x00, 0x10, 0x00, 0x00
            0xc: .byte 0x00, 0x00, 0x00, 0x00
            0x10: stur x16, [x21]
        ");
        assert_snapshot!(cb.hexdump(), @"50000058030000140010000000000000b00200f8");
    }

    /// When the assembler owns a scratch register, a store to memory based on
    /// that register compiles fine (immediate goes through x16).
    #[test]
    fn test_store_with_valid_scratch_reg() {
        let (mut asm, scratch_reg) = Assembler::new_with_scratch_reg();
        let mut cb = CodeBlock::new_dummy();
        asm.store(Opnd::mem(64, scratch_reg, 0), 0x83902.into());

        asm.compile_with_num_regs(&mut cb, 0);
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x16, #0x3902
        0x4: movk x16, #8, lsl #16
        0x8: stur x16, [x15]
        ");
        assert_snapshot!(cb.hexdump(), @"502087d21001a0f2f00100f8");
    }

    /// Using another assembler's scratch register as a store base must panic,
    /// since this assembler would clobber it when splitting the immediate.
    #[test]
    #[should_panic]
    fn test_store_with_invalid_scratch_reg() {
        let (_, scratch_reg) = Assembler::new_with_scratch_reg();
        let (mut asm, mut cb) = setup_asm();
        // This would put the source into scratch_reg, messing up the destination
        asm.store(Opnd::mem(64, scratch_reg, 0), 0x83902.into());

        asm.compile_with_num_regs(&mut cb, 0);
    }

    #[test]
    #[should_panic]
    fn test_load_into_with_invalid_scratch_reg() {
        // Same setup as test_store_with_invalid_scratch_reg, but exercising
        // load_into: targeting a foreign assembler's scratch register panics.
        let (_, scratch_reg) = Assembler::new_with_scratch_reg();
        let (mut asm, mut cb) = setup_asm();
        // This would put the source into scratch_reg, messing up the destination
        asm.load_into(scratch_reg, 0x83902.into());

        asm.compile_with_num_regs(&mut cb, 0);
    }

    #[test]
    fn test_emit_lea_label() {
        let (mut asm, mut cb) = setup_asm();

        // Take the address of a label with ADR, then bake string bytes at
        // that label and store the computed address.
        let label = asm.new_label("label");
        let opnd = asm.lea_jump_target(label.clone());

        asm.write_label(label);
        asm.bake_string("Hello, world!");
        asm.store(Opnd::mem(64, SP, 0), opnd);

        asm.compile_with_num_regs(&mut cb, 1);
        // The rows from 0x8 to 0x14 are the baked "Hello, world!" bytes, which
        // the disassembler misreads as instructions/raw bytes.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: adr x16, #8
        0x4: mov x0, x16
        0x8: ldnp d8, d25, [x10, #-0x140]
        0xc: .byte 0x6f, 0x2c, 0x20, 0x77
        0x10: .byte 0x6f, 0x72, 0x6c, 0x64
        0x14: .byte 0x21, 0x00, 0x00, 0x00
        0x18: stur x0, [x21]
        ");
        assert_snapshot!(cb.hexdump(), @"50000010e00310aa48656c6c6f2c20776f726c6421000000a00200f8");
    }

    #[test]
    fn test_emit_load_mem_disp_fits_into_load() {
        let (mut asm, mut cb) = setup_asm();

        // Displacement 0 fits directly into LDUR's immediate, so no extra
        // address-computation instruction is needed.
        let opnd = asm.load(Opnd::mem(64, SP, 0));
        asm.store(Opnd::mem(64, SP, 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that two instructions were written: LDUR and STUR.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: ldur x0, [x21]
        0x4: stur x0, [x21]
        ");
        assert_snapshot!(cb.hexdump(), @"a00240f8a00200f8");
    }

    #[test]
    fn test_emit_load_mem_disp_fits_into_add() {
        let (mut asm, mut cb) = setup_asm();

        // Displacement 1 << 10 is too large for LDUR's immediate but fits
        // ADD's immediate, so the address is materialized with one ADD.
        let opnd = asm.load(Opnd::mem(64, SP, 1 << 10));
        asm.store(Opnd::mem(64, SP, 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that three instructions were written: ADD, LDUR, and STUR.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: add x0, x21, #0x400
        0x4: ldur x0, [x0]
        0x8: stur x0, [x21]
        ");
        assert_snapshot!(cb.hexdump(), @"a0021091000040f8a00200f8");
    }

    #[test]
    fn test_emit_load_mem_disp_does_not_fit_into_add() {
        let (mut asm, mut cb) = setup_asm();

        // Displacement (1 << 12) | 1 does not fit ADD's immediate either, so
        // it is first loaded into a register with MOVZ.
        let opnd = asm.load(Opnd::mem(64, SP, 1 << 12 | 1));
        asm.store(Opnd::mem(64, SP, 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that four instructions were written: MOVZ, ADD, LDUR, and STUR.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x0, #0x1001
        0x4: add x0, x21, x0, uxtx
        0x8: ldur x0, [x0]
        0xc: stur x0, [x21]
        ");
        assert_snapshot!(cb.hexdump(), @"200082d2a062208b000040f8a00200f8");
    }

    #[test]
    fn test_emit_load_value_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // Qnil is a special-constant VALUE, so it can be loaded as a plain
        // immediate rather than baked into the code as GC-tracked data.
        let opnd = asm.load(Opnd::Value(Qnil));
        asm.store(Opnd::mem(64, SP, 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that only two instructions were written since the value is an
        // immediate.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x0, #4
        0x4: stur x0, [x21]
        ");
        assert_snapshot!(cb.hexdump(), @"800080d2a00200f8");
    }

    #[test]
    fn test_emit_load_value_non_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // A non-immediate VALUE gets emitted as 8 bytes of inline data
        // (decoded by capstone as the bogus `eon` rows below), loaded
        // PC-relatively with LDR and branched over.
        let opnd = asm.load(Opnd::Value(VALUE(0xCAFECAFECAFE0000)));
        asm.store(Opnd::mem(64, SP, 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that five instructions were written since the value is not an
        // immediate and needs to be loaded into a register.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: ldr x0, #8
        0x4: b #0x10
        0x8: eon x0, x0, x30, ror #0
        0xc: eon x30, x23, x30, ror #50
        0x10: stur x0, [x21]
        ");
        assert_snapshot!(cb.hexdump(), @"40000058030000140000fecafecafecaa00200f8");
    }

    #[test]
    fn test_emit_test_32b_reg_not_bitmask_imm() {
        let (mut asm, mut cb) = setup_asm();
        let w0 = Opnd::Reg(X0_REG).with_num_bits(32);
        asm.test(w0, Opnd::UImm(u32::MAX.into()));
        // All ones is not encodable with a bitmask immediate,
        // so this needs one register
        asm.compile_with_num_regs(&mut cb, 1);

        // The immediate is materialized with ORR against xzr, then TST
        // compares the 32-bit views of the registers.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: orr x0, xzr, #0xffffffff
        0x4: tst w0, w0
        ");
        assert_snapshot!(cb.hexdump(), @"e07f40b21f00006a");
    }

    #[test]
    fn test_emit_test_32b_reg_bitmask_imm() {
        // 0x80000001 is a valid arm64 bitmask immediate, so the 32-bit test
        // encodes as a single TST with no scratch register.
        let (mut asm, mut cb) = setup_asm();
        let w0 = Opnd::Reg(X0_REG).with_num_bits(32);
        asm.test(w0, Opnd::UImm(0x80000001));
        asm.compile_with_num_regs(&mut cb, 0);

        assert_disasm_snapshot!(cb.disasm(), @"  0x0: tst w0, #0x80000001");
        assert_snapshot!(cb.hexdump(), @"1f040172");
    }

    #[test]
    fn test_emit_or() {
        let (mut asm, mut cb) = setup_asm();

        // Register-register `or` lowers to a single ORR feeding the store.
        let opnd = asm.or(Opnd::Reg(X0_REG), Opnd::Reg(X1_REG));
        asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: orr x0, x0, x1
        0x4: stur x0, [x2]
        ");
        assert_snapshot!(cb.hexdump(), @"000001aa400000f8");
    }

    #[test]
    fn test_emit_lshift() {
        let (mut asm, mut cb) = setup_asm();

        // Left shift by an immediate lowers to a single LSL.
        let opnd = asm.lshift(Opnd::Reg(X0_REG), Opnd::UImm(5));
        asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: lsl x0, x0, #5
        0x4: stur x0, [x2]
        ");
        assert_snapshot!(cb.hexdump(), @"00e87bd3400000f8");
    }

    #[test]
    fn test_emit_rshift() {
        let (mut asm, mut cb) = setup_asm();

        // `rshift` is the arithmetic (sign-extending) right shift: ASR.
        let opnd = asm.rshift(Opnd::Reg(X0_REG), Opnd::UImm(5));
        asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: asr x0, x0, #5
        0x4: stur x0, [x2]
        ");
        assert_snapshot!(cb.hexdump(), @"00fc4593400000f8");
    }

    #[test]
    fn test_emit_urshift() {
        let (mut asm, mut cb) = setup_asm();

        // `urshift` is the logical (zero-filling) right shift: LSR.
        let opnd = asm.urshift(Opnd::Reg(X0_REG), Opnd::UImm(5));
        asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd);
        asm.compile_with_num_regs(&mut cb, 1);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: lsr x0, x0, #5
        0x4: stur x0, [x2]
        ");
        assert_snapshot!(cb.hexdump(), @"00fc45d3400000f8");
    }

    #[test]
    fn test_emit_test() {
        let (mut asm, mut cb) = setup_asm();

        // Register-register test lowers directly to TST.
        asm.test(Opnd::Reg(X0_REG), Opnd::Reg(X1_REG));
        asm.compile_with_num_regs(&mut cb, 0);

        // Assert that only one instruction was written.
        assert_disasm_snapshot!(cb.disasm(), @"  0x0: tst x0, x1");
        assert_snapshot!(cb.hexdump(), @"1f0001ea");
    }

    #[test]
    fn test_emit_test_with_encodable_unsigned_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // 7 (0b111) is a valid bitmask immediate, so TST takes it directly.
        asm.test(Opnd::Reg(X0_REG), Opnd::UImm(7));
        asm.compile_with_num_regs(&mut cb, 0);

        // Assert that only one instruction was written.
        assert_disasm_snapshot!(cb.disasm(), @"  0x0: tst x0, #7");
        assert_snapshot!(cb.hexdump(), @"1f0840f2");
    }

    #[test]
    fn test_emit_test_with_unencodable_unsigned_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // 5 (0b101) is not a valid bitmask immediate, so it is loaded into
        // a register first.
        asm.test(Opnd::Reg(X0_REG), Opnd::UImm(5));
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that a load and a test instruction were written.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x0, #5
        0x4: tst x0, x0
        ");
        assert_snapshot!(cb.hexdump(), @"a00080d21f0000ea");
    }

    #[test]
    fn test_emit_test_with_encodable_signed_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // A positive Imm that is a valid bitmask immediate encodes the same
        // way as the UImm case: one TST.
        asm.test(Opnd::Reg(X0_REG), Opnd::Imm(7));
        asm.compile_with_num_regs(&mut cb, 0);

        // Assert that only one instruction was written.
        assert_disasm_snapshot!(cb.disasm(), @"  0x0: tst x0, #7");
        assert_snapshot!(cb.hexdump(), @"1f0840f2");
    }

    #[test]
    fn test_emit_test_with_unencodable_signed_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // An unencodable signed immediate is loaded into a register first,
        // mirroring the unsigned case.
        asm.test(Opnd::Reg(X0_REG), Opnd::Imm(5));
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that a load and a test instruction were written.
        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x0, #5
        0x4: tst x0, x0
        ");
        assert_snapshot!(cb.hexdump(), @"a00080d21f0000ea");
    }

    #[test]
    fn test_emit_test_with_negative_signed_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // -7's two's-complement pattern is a valid bitmask immediate, so it
        // still encodes as a single TST.
        asm.test(Opnd::Reg(X0_REG), Opnd::Imm(-7));
        asm.compile_with_num_regs(&mut cb, 1);

        // Assert that a test instruction is written.
        assert_disasm_snapshot!(cb.disasm(), @"  0x0: tst x0, #-7");
        assert_snapshot!(cb.hexdump(), @"1ff47df2");
    }

    #[test]
    fn test_32_bit_register_with_some_number() {
        let (mut asm, mut cb) = setup_asm();

        // Comparing a 32-bit memory operand against an unencodable immediate:
        // the load uses a 32-bit LDUR and the immediate is materialized, then
        // CMP operates on the 32-bit register views.
        let shape_opnd = Opnd::mem(32, Opnd::Reg(X0_REG), 6);
        asm.cmp(shape_opnd, Opnd::UImm(4097));
        asm.compile_with_num_regs(&mut cb, 2);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: ldur w0, [x0, #6]
        0x4: mov x1, #0x1001
        0x8: cmp w0, w1
        ");
        assert_snapshot!(cb.hexdump(), @"006040b8210082d21f00016b");
    }

    #[test]
    fn test_16_bit_register_store_some_number() {
        let (mut asm, mut cb) = setup_asm();

        // Storing to a 16-bit memory operand uses the halfword store STURH
        // after materializing the immediate.
        let shape_opnd = Opnd::mem(16, Opnd::Reg(X0_REG), 0);
        asm.store(shape_opnd, Opnd::UImm(4097));
        asm.compile_with_num_regs(&mut cb, 2);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x16, #0x1001
        0x4: sturh w16, [x0]
        ");
        assert_snapshot!(cb.hexdump(), @"300082d210000078");
    }

    #[test]
    fn test_32_bit_register_store_some_number() {
        let (mut asm, mut cb) = setup_asm();

        // Storing to a 32-bit memory operand uses a 32-bit STUR (w register)
        // after materializing the immediate.
        let shape_opnd = Opnd::mem(32, Opnd::Reg(X0_REG), 6);
        asm.store(shape_opnd, Opnd::UImm(4097));
        asm.compile_with_num_regs(&mut cb, 2);

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x16, #0x1001
        0x4: stur w16, [x0, #6]
        ");
        assert_snapshot!(cb.hexdump(), @"300082d2106000b8");
    }

    #[test]
    fn test_emit_xor() {
        let (mut asm, mut cb) = setup_asm();

        // Register-register xor lowers to a single EOR feeding the store.
        let opnd = asm.xor(Opnd::Reg(X0_REG), Opnd::Reg(X1_REG));
        asm.store(Opnd::mem(64, Opnd::Reg(X2_REG), 0), opnd);

        asm.compile_with_num_regs(&mut cb, 1);

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: eor x0, x0, x1
            0x4: stur x0, [x2]
        ");
        assert_snapshot!(cb.hexdump(), @"000001ca400000f8");
    }

    #[test]
    #[cfg(feature = "disasm")]
    fn test_simple_disasm() -> std::result::Result<(), capstone::Error> {
        // Test drive Capstone with simple input
        use capstone::prelude::*;

        let cs = Capstone::new()
            .arm64()
            .mode(arch::arm64::ArchMode::Arm)
            .build()?;

        // 0xF2800F60 is a single MOVK instruction (little-endian bytes).
        let insns = cs.disasm_all(&[0x60, 0x0f, 0x80, 0xF2], 0x1000)?;

        // Expect exactly one decoded instruction with the `movk` mnemonic.
        match insns.as_ref() {
            [insn] => {
                assert_eq!(Some("movk"), insn.mnemonic());
                Ok(())
            }
            _ => Err(capstone::Error::CustomError(
                "expected to disassemble to movk",
            )),
        }
    }

    #[test]
    fn test_replace_mov_with_ldur() {
        let (mut asm, mut cb) = setup_asm();

        // A mov with a memory source becomes a single LDUR — no separate
        // load instruction is emitted.
        asm.mov(Opnd::Reg(TEMP_REGS[0]), Opnd::mem(64, CFP, 8));
        asm.compile_with_num_regs(&mut cb, 1);

        assert_disasm_snapshot!(cb.disasm(), @"  0x0: ldur x1, [x19, #8]");
        assert_snapshot!(cb.hexdump(), @"618240f8");
    }

    #[test]
    fn test_not_split_mov() {
        let (mut asm, mut cb) = setup_asm();

        // Both immediates are representable without a scratch register:
        // 0xffff via MOVZ, 0x10000 via an ORR bitmask immediate. Neither mov
        // should be split into multiple instructions.
        asm.mov(Opnd::Reg(TEMP_REGS[0]), Opnd::UImm(0xffff));
        asm.mov(Opnd::Reg(TEMP_REGS[0]), Opnd::UImm(0x10000));
        asm.compile_with_num_regs(&mut cb, 1);

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x1, #0xffff
            0x4: orr x1, xzr, #0x10000
        ");
        assert_snapshot!(cb.hexdump(), @"e1ff9fd2e10370b2");
    }

    #[test]
    fn test_merge_csel_mov() {
        let (mut asm, mut cb) = setup_asm();

        // The trailing mov is merged into the csel: the csel writes x1 (the
        // mov's destination) directly, so no separate mov is emitted.
        let out = asm.csel_l(Qtrue.into(), Qfalse.into());
        asm.mov(Opnd::Reg(TEMP_REGS[0]), out);
        asm.compile_with_num_regs(&mut cb, 2);

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x0, #0x14
            0x4: mov x1, #0
            0x8: csel x1, x0, x1, lt
        ");
        assert_snapshot!(cb.hexdump(), @"800280d2010080d201b0819a");
    }

    #[test]
    fn test_label_branch_generate_bounds() {
        // The immediate in a conditional branch is a 19 bit signed integer
        // (scaled by 4), so the max forward distance is 2^18 - 1 instructions.
        // This test pads a conditional jump to exactly that distance to check
        // that label resolution still succeeds at the boundary.
        const IMMEDIATE_MAX_VALUE: usize = 2usize.pow(18) - 1;

        // `IMMEDIATE_MAX_VALUE` number of dummy instructions will be generated
        // plus a compare, a jump instruction, and a label.
        // Adding page_size to avoid OOM on the last page.
        let page_size = unsafe { rb_jit_get_page_size() } as usize;
        let memory_required = (IMMEDIATE_MAX_VALUE + 8) * 4 + page_size;

        let mut asm = Assembler::new();
        let mut cb = CodeBlock::new_dummy_sized(memory_required);

        let far_label = asm.new_label("far");

        asm.cmp(Opnd::Reg(X0_REG), Opnd::UImm(1));
        asm.je(far_label.clone());

        // Filler instructions pushing the label to the edge of branch range.
        (0..IMMEDIATE_MAX_VALUE).for_each(|_| {
            asm.mov(Opnd::Reg(TEMP_REGS[0]), Opnd::Reg(TEMP_REGS[2]));
        });

        asm.write_label(far_label.clone());
        asm.compile_with_num_regs(&mut cb, 1);
    }

    #[test]
    fn test_add_with_immediate() {
        let (mut asm, mut cb) = setup_asm();

        // Small immediates encode directly into ADDS, and the final mov is
        // merged into the second add's destination (x1), so only two
        // instructions are emitted.
        let out = asm.add(Opnd::Reg(TEMP_REGS[1]), 1.into());
        let out = asm.add(out, 1_usize.into());
        asm.mov(Opnd::Reg(TEMP_REGS[0]), out);
        asm.compile_with_num_regs(&mut cb, 2);

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: adds x0, x9, #1
            0x4: adds x1, x0, #1
        ");
        assert_snapshot!(cb.hexdump(), @"200500b1010400b1");
    }

    #[test]
    fn test_store_spilled_byte() {
        let (mut asm, mut cb) = setup_asm();

        // An 8-bit memory-to-memory store is split through a scratch register
        // using the byte-sized LDURB/STURB pair.
        asm.store(Opnd::mem(8, C_RET_OPND, 0), Opnd::mem(8, C_RET_OPND, 8));
        asm.compile_with_num_regs(&mut cb, 0); // spill every VReg

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: ldurb w16, [x0, #8]
        0x4: sturb w16, [x0]
        ");
        assert_snapshot!(cb.hexdump(), @"1080403810000038");
    }

    #[test]
    fn test_ccall_resolve_parallel_moves_no_cycle() {
        let (mut asm, mut cb) = setup_asm();

        // Every argument is already in its destination register, so the
        // parallel-move resolver emits no moves — just the call sequence.
        asm.ccall(0 as _, vec![
            C_ARG_OPNDS[0], // mov x0, x0 (optimized away)
            C_ARG_OPNDS[1], // mov x1, x1 (optimized away)
        ]);
        asm.compile_with_num_regs(&mut cb, ALLOC_REGS.len());

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x16, #0
            0x4: blr x16
        ");
        assert_snapshot!(cb.hexdump(), @"100080d200023fd6");
    }

    #[test]
    fn test_ccall_resolve_parallel_moves_single_cycle() {
        let (mut asm, mut cb) = setup_asm();

        // x0 and x1 form a cycle
        // The resolver breaks it with the scratch register x15.
        asm.ccall(0 as _, vec![
            C_ARG_OPNDS[1], // mov x0, x1
            C_ARG_OPNDS[0], // mov x1, x0
            C_ARG_OPNDS[2], // mov x2, x2 (optimized away)
        ]);
        asm.compile_with_num_regs(&mut cb, ALLOC_REGS.len());

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x15, x0
            0x4: mov x0, x1
            0x8: mov x1, x15
            0xc: mov x16, #0
            0x10: blr x16
        ");
        assert_snapshot!(cb.hexdump(), @"ef0300aae00301aae1030faa100080d200023fd6");
    }

    #[test]
    fn test_ccall_resolve_parallel_moves_two_cycles() {
        let (mut asm, mut cb) = setup_asm();

        // x0 and x1 form a cycle, and x2 and x3 form another cycle.
        // Each cycle is broken independently via the scratch register x15.
        asm.ccall(0 as _, vec![
            C_ARG_OPNDS[1], // mov x0, x1
            C_ARG_OPNDS[0], // mov x1, x0
            C_ARG_OPNDS[3], // mov x2, x3
            C_ARG_OPNDS[2], // mov x3, x2
        ]);
        asm.compile_with_num_regs(&mut cb, ALLOC_REGS.len());

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x15, x2
            0x4: mov x2, x3
            0x8: mov x3, x15
            0xc: mov x15, x0
            0x10: mov x0, x1
            0x14: mov x1, x15
            0x18: mov x16, #0
            0x1c: blr x16
        ");
        assert_snapshot!(cb.hexdump(), @"ef0302aae20303aae3030faaef0300aae00301aae1030faa100080d200023fd6");
    }

    #[test]
    fn test_ccall_resolve_parallel_moves_large_cycle() {
        let (mut asm, mut cb) = setup_asm();

        // x0, x1, and x2 form a cycle
        // A three-register cycle still needs only one scratch register (x15).
        asm.ccall(0 as _, vec![
            C_ARG_OPNDS[1], // mov x0, x1
            C_ARG_OPNDS[2], // mov x1, x2
            C_ARG_OPNDS[0], // mov x2, x0
        ]);
        asm.compile_with_num_regs(&mut cb, ALLOC_REGS.len());

        assert_disasm_snapshot!(cb.disasm(), @"
            0x0: mov x15, x0
            0x4: mov x0, x1
            0x8: mov x1, x2
            0xc: mov x2, x15
            0x10: mov x16, #0
            0x14: blr x16
        ");
        assert_snapshot!(cb.hexdump(), @"ef0300aae00301aae10302aae2030faa100080d200023fd6");
    }

    #[test]
    fn test_split_spilled_lshift() {
        let (mut asm, mut cb) = setup_asm();

        // With zero allocatable registers every VReg lives on the native
        // stack, so each op is bracketed by reload/spill through [x29, #-8].
        let opnd_vreg = asm.load(1.into());
        let out_vreg = asm.lshift(opnd_vreg, Opnd::UImm(1));
        asm.mov(C_RET_OPND, out_vreg);
        asm.compile_with_num_regs(&mut cb, 0); // spill every VReg

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: mov x16, #1
        0x4: stur x16, [x29, #-8]
        0x8: ldur x15, [x29, #-8]
        0xc: lsl x15, x15, #1
        0x10: stur x15, [x29, #-8]
        0x14: ldur x0, [x29, #-8]
        ");
        assert_snapshot!(cb.hexdump(), @"300080d2b0831ff8af835ff8eff97fd3af831ff8a0835ff8");
    }

    #[test]
    fn test_split_load16_mem_mem_with_large_displacement() {
        let (mut asm, mut cb) = setup_asm();

        // 0x200 exceeds the 16-bit load's unscaled-immediate range, so the
        // split pass computes the address with ADD before LDURH.
        let _ = asm.load(Opnd::mem(16, C_RET_OPND, 0x200));
        asm.compile(&mut cb).unwrap();

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: add x0, x0, #0x200
        0x4: ldurh w0, [x0]
        ");
        assert_snapshot!(cb.hexdump(), @"0000089100004078");
    }

    #[test]
    fn test_split_load32_mem_mem_with_large_displacement() {
        let (mut asm, mut cb) = setup_asm();

        // Same as the 16-bit case but for a 32-bit load: ADD then LDUR (w).
        let _ = asm.load(Opnd::mem(32, C_RET_OPND, 0x200));
        asm.compile(&mut cb).unwrap();

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: add x0, x0, #0x200
        0x4: ldur w0, [x0]
        ");
        assert_snapshot!(cb.hexdump(), @"00000891000040b8");
    }

    #[test]
    fn test_split_load64_mem_mem_with_large_displacement() {
        let (mut asm, mut cb) = setup_asm();

        // Same as the 16/32-bit cases but for a 64-bit load: ADD then LDUR (x).
        let _ = asm.load(Opnd::mem(64, C_RET_OPND, 0x200));
        asm.compile(&mut cb).unwrap();

        assert_disasm_snapshot!(cb.disasm(), @r"
        0x0: add x0, x0, #0x200
        0x4: ldur x0, [x0]
        ");
        assert_snapshot!(cb.hexdump(), @"00000891000040f8");
    }
}
